aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/docs/AMDGPUUsage.rst2
-rw-r--r--llvm/docs/CodeOfConduct.rst1
-rw-r--r--llvm/docs/CommandGuide/dsymutil.rst8
-rw-r--r--llvm/docs/LangRef.rst25
-rw-r--r--llvm/docs/SPIRVUsage.rst2
-rw-r--r--llvm/docs/TableGen/BackEnds.rst50
-rw-r--r--llvm/include/llvm-c/DebugInfo.h24
-rw-r--r--llvm/include/llvm/ADT/APFloat.h152
-rw-r--r--llvm/include/llvm/ADT/DenseMap.h2
-rw-r--r--llvm/include/llvm/ADT/DepthFirstIterator.h18
-rw-r--r--llvm/include/llvm/ADT/ImmutableSet.h6
-rw-r--r--llvm/include/llvm/ADT/PostOrderIterator.h6
-rw-r--r--llvm/include/llvm/ADT/STLExtras.h2
-rw-r--r--llvm/include/llvm/ADT/STLForwardCompat.h48
-rw-r--r--llvm/include/llvm/ADT/SmallPtrSet.h12
-rw-r--r--llvm/include/llvm/ADT/bit.h42
-rw-r--r--llvm/include/llvm/Analysis/LoopAnalysisManager.h2
-rw-r--r--llvm/include/llvm/Analysis/LoopInfo.h2
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolution.h1
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h12
-rw-r--r--llvm/include/llvm/IR/DebugProgramInstruction.h10
-rw-r--r--llvm/include/llvm/IR/Value.h4
-rw-r--r--llvm/include/llvm/LTO/LTO.h6
-rw-r--r--llvm/include/llvm/Support/Alignment.h2
-rw-r--r--llvm/include/llvm/Support/Casting.h7
-rw-r--r--llvm/include/llvm/Support/CommandLine.h2
-rw-r--r--llvm/include/llvm/Support/DOTGraphTraits.h5
-rw-r--r--llvm/include/llvm/Support/ELFAttributes.h2
-rw-r--r--llvm/include/llvm/Support/LSP/Protocol.h2
-rw-r--r--llvm/include/llvm/Support/MD5.h2
-rw-r--r--llvm/include/llvm/Support/MathExtras.h53
-rw-r--r--llvm/include/llvm/Support/Timer.h2
-rw-r--r--llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h1
-rw-r--r--llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h10
-rw-r--r--llvm/include/llvm/Transforms/Scalar/LoopPassManager.h15
-rwxr-xr-xllvm/lib/Analysis/ConstantFolding.cpp56
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp10
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp4
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp166
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp55
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp59
-rw-r--r--llvm/lib/IR/DebugInfo.cpp43
-rw-r--r--llvm/lib/IR/Verifier.cpp5
-rw-r--r--llvm/lib/LTO/LTO.cpp34
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp1
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp33
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp56
-rw-r--r--llvm/lib/Passes/PassRegistry.def1
-rw-r--r--llvm/lib/Remarks/BitstreamRemarkParser.h4
-rw-r--r--llvm/lib/Support/APFloat.cpp538
-rw-r--r--llvm/lib/Support/SourceMgr.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h9
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h9
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp47
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp29
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp22
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td28
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td726
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h5
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp20
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp10
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp32
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td4
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp74
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp20
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp22
-rw-r--r--llvm/lib/Target/X86/X86MCInstLower.cpp31
-rw-r--r--llvm/lib/TargetParser/RISCVISAInfo.cpp2
-rw-r--r--llvm/lib/Transforms/CFGuard/CFGuard.cpp25
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp39
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp42
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp20
-rw-r--r--llvm/lib/Transforms/Scalar/LoopFuse.cpp34
-rw-r--r--llvm/lib/Transforms/Scalar/LoopPassManager.cpp5
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp59
-rw-r--r--llvm/lib/Transforms/Scalar/Reg2Mem.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp34
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp40
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp96
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp111
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp9
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h2
-rw-r--r--llvm/test/Bindings/llvm-c/debug_info_new_format.ll107
-rw-r--r--llvm/test/CodeGen/AMDGPU/abs_i16.ll980
-rw-r--r--llvm/test/CodeGen/AMDGPU/add.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll22342
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll2356
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll5894
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll1242
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll768
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll362
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll7815
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll2484
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll4594
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll1340
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll4962
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll5336
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll5688
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll6014
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll6338
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll1411
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/bypass-div.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll5
-rw-r--r--llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir87
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll335
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg.bf16.ll123
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptosi.f16.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoui.f16.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/frem.ll65
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i8.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv.ll788
-rw-r--r--llvm/test/CodeGen/AMDGPU/select.f16.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll29
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll20
-rw-r--r--llvm/test/CodeGen/NVPTX/i32x2-instructions.ll167
-rw-r--r--llvm/test/CodeGen/RISCV/attributes.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/div_minsize.ll148
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/mixed-float-bf16-arith.ll186
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfadd-bf.ll607
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfclass-bf.ll294
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmacc-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmadd-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmax-bf.ll571
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmerge-bf.ll258
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmin-bf.ll571
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmsac-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmsub-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmul-bf.ll607
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll88
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmv-s-bf.ll161
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmv-v-bf.ll216
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-bf-f.ll226
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-bf.ll270
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-bf.ll270
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfncvt-x-bf.ll288
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-bf.ll288
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfnmacc-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfnmadd-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfnmsac-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfnmsub-bf.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfrec7-bf.ll282
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-bf16.ll264
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfrsub-bf.ll282
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfsgnj-bf.ll571
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfsgnjn-bf.ll571
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfsgnjx-bf.ll571
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfslide1down-bf.ll288
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfslide1up-bf.ll294
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfsub-bf.ll559
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwadd-bf.ll519
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwadd-w-bf.ll773
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-x.ll264
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-xu.ll264
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwmsac-bf.ll506
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwmul-bf.ll519
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwnmacc-bf.ll506
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwnmsac-bf.ll506
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwsub-bf.ll519
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwsub-w-bf.ll773
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmfeq-bf.ll496
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmfge-bf.ll496
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmfgt-bf.ll496
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmfle-bf.ll496
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmflt-bf.ll496
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmfne-bf.ll496
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll21
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll22
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll21
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWBufferNonUniformIdx.ll (renamed from llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/StructuredBufferNonUniformIdx.ll)0
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll1
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageConstIdx.ll (renamed from llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll)2
-rw-r--r--llvm/test/CodeGen/X86/avx-shift.ll2
-rw-r--r--llvm/test/CodeGen/X86/avx2-arith.ll4
-rw-r--r--llvm/test/CodeGen/X86/combine-mul.ll22
-rw-r--r--llvm/test/CodeGen/X86/combine-multiplies.ll4
-rw-r--r--llvm/test/CodeGen/X86/combine-pmuldq.ll24
-rw-r--r--llvm/test/CodeGen/X86/combine-rotates.ll4
-rw-r--r--llvm/test/CodeGen/X86/combine-sdiv.ll8
-rw-r--r--llvm/test/CodeGen/X86/combine-shl.ll54
-rw-r--r--llvm/test/CodeGen/X86/combine-srem.ll10
-rw-r--r--llvm/test/CodeGen/X86/combine-udiv.ll10
-rw-r--r--llvm/test/CodeGen/X86/combine-umax.ll2
-rw-r--r--llvm/test/CodeGen/X86/combine-umin.ll2
-rw-r--r--llvm/test/CodeGen/X86/combine-urem.ll4
-rw-r--r--llvm/test/CodeGen/X86/dagcombine-shifts.ll4
-rw-r--r--llvm/test/CodeGen/X86/funnel-shift.ll8
-rw-r--r--llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll8
-rw-r--r--llvm/test/CodeGen/X86/known-pow2.ll6
-rw-r--r--llvm/test/CodeGen/X86/madd.ll8
-rw-r--r--llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll16
-rw-r--r--llvm/test/CodeGen/X86/pmul.ll2
-rw-r--r--llvm/test/CodeGen/X86/pr162812.ll50
-rw-r--r--llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll2
-rw-r--r--llvm/test/CodeGen/X86/rotate-extract-vector.ll38
-rw-r--r--llvm/test/CodeGen/X86/sdiv-exact.ll18
-rw-r--r--llvm/test/CodeGen/X86/shrink_vmul.ll20
-rw-r--r--llvm/test/CodeGen/X86/slow-pmulld.ll8
-rw-r--r--llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll388
-rw-r--r--llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll24
-rw-r--r--llvm/test/CodeGen/X86/udiv-exact.ll18
-rw-r--r--llvm/test/CodeGen/X86/undo-mul-and.ll18
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll8
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll470
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll20
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll36
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll20
-rw-r--r--llvm/test/CodeGen/X86/var-permute-128.ll8
-rw-r--r--llvm/test/CodeGen/X86/vec_reassociate.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-compress.ll70
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-128.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-256.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-rot-128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-rot-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-sub128.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-128.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-256.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-rot-128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-rot-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll36
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll32
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll32
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll56
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll56
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll56
-rw-r--r--llvm/test/CodeGen/X86/vector-mul.ll100
-rw-r--r--llvm/test/CodeGen/X86/vector-rotate-128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-rotate-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-shl-128.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-shl-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-trunc-math.ll92
-rw-r--r--llvm/test/CodeGen/X86/vselect-avx.ll18
-rw-r--r--llvm/test/CodeGen/X86/vselect-pcmp.ll6
-rw-r--r--llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll216
-rw-r--r--llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll10
-rw-r--r--llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir99
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll63
-rw-r--r--llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll21
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s12
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s75
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt422
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt402
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt8
-rw-r--r--llvm/test/MC/X86/apx/pushp-popp-att.s8
-rw-r--r--llvm/test/Other/loop-pm-invalidation.ll30
-rw-r--r--llvm/test/Other/new-pm-defaults.ll1
-rw-r--r--llvm/test/Other/new-pm-thinlto-postlink-defaults.ll1
-rw-r--r--llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll1
-rw-r--r--llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll1
-rw-r--r--llvm/test/Other/new-pm-thinlto-prelink-defaults.ll1
-rw-r--r--llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll1
-rw-r--r--llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll1
-rw-r--r--llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll18
-rw-r--r--llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll173
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-trunc.ll30
-rw-r--r--llvm/test/Transforms/InstCombine/scmp.ll261
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll1
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll17
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll51
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll17
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll1
-rw-r--r--llvm/test/Transforms/LoopPredication/preserve-bpi.ll60
-rw-r--r--llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/pr48832.ll2
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll174
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll162
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll96
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll96
-rw-r--r--llvm/test/Transforms/PhaseOrdering/unswitch-cold-func.ll (renamed from llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll)9
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll89
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll1
-rw-r--r--llvm/test/Verifier/matrix-intrinsics.ll23
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s (renamed from llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s)0
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s (renamed from llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s)0
-rw-r--r--llvm/test/tools/llvm-reduce/reduce-instructions-alloca.ll16
-rw-r--r--llvm/tools/llvm-c-test/debuginfo.c5
-rw-r--r--llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp6
-rw-r--r--llvm/unittests/ADT/BitTest.cpp16
-rw-r--r--llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp51
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp2
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.cpp11
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.cpp2
-rw-r--r--llvm/utils/gn/secondary/lld/test/BUILD.gn7
-rw-r--r--llvm/utils/profcheck-xfail.txt1
-rw-r--r--[-rwxr-xr-x]llvm/utils/release/build_llvm_release.bat150
320 files changed, 80553 insertions, 31403 deletions
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 8193adc..e062032 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -883,6 +883,8 @@ supported for the ``amdgcn`` target.
Buffer Fat Pointer 7 N/A N/A 160 0
Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000
Buffer Strided Pointer (experimental) 9 *TODO*
+ *reserved for downstream use* 10
+ *reserved for downstream use* 11
Streamout Registers 128 N/A GS_REGS
===================================== =============== =========== ================ ======= ============================
diff --git a/llvm/docs/CodeOfConduct.rst b/llvm/docs/CodeOfConduct.rst
index 645ae12..995d32b 100644
--- a/llvm/docs/CodeOfConduct.rst
+++ b/llvm/docs/CodeOfConduct.rst
@@ -171,6 +171,7 @@ The current committee members are:
Transparency Reports
====================
+* `July 15, 2025 <https://discourse.llvm.org/t/llvm-code-of-conduct-transparency-report-july-15-2024-july-15-2025/88622>`_
* `July 15, 2024 <https://discourse.llvm.org/t/llvm-code-of-conduct-transparency-report-july-15-2023-july-15-2024/82687>`_
* `July 15, 2023 <https://llvm.org/coc-reports/2023-07-15-report.html>`_
* `July 15, 2022 <https://llvm.org/coc-reports/2022-07-15-report.html>`_
diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst
index 8764e1f..8e61e01 100644
--- a/llvm/docs/CommandGuide/dsymutil.rst
+++ b/llvm/docs/CommandGuide/dsymutil.rst
@@ -75,14 +75,6 @@ OPTIONS
Make a static variable keep the enclosing function even if it would have been
omitted otherwise.
-.. option:: --minimize, -z
-
- When used when creating a dSYM file, this option will suppress the emission of
- the .debug_inlines, .debug_pubnames, and .debug_pubtypes sections since
- dsymutil currently has better equivalents: .apple_names and .apple_types. When
- used in conjunction with ``--update`` option, this option will cause redundant
- accelerator tables to be removed.
-
.. option:: --no-object-timestamp
Don't check timestamp for object files.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 0c54f57..5b4b53d 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -21062,12 +21062,15 @@ integer element type.
Syntax:
"""""""
-This is an overloaded intrinsic.
+This is an overloaded intrinsic. You can use ``llvm.matrix.column.major.load``
+to load any vector type with a stride of any bitwidth up to 64.
::
- declare vectorty @llvm.matrix.column.major.load.*(
+ declare <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(
ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
+ declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(
+ ptrty %Ptr, i32 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
Overview:
"""""""""
@@ -21086,9 +21089,9 @@ Arguments:
The first argument ``%Ptr`` is a pointer type to the returned vector type, and
corresponds to the start address to load from. The second argument ``%Stride``
-is a positive, constant integer with ``%Stride >= <Rows>``. ``%Stride`` is used
-to compute the column memory addresses. I.e., for a column ``C``, its start
-memory addresses is calculated with ``%Ptr + C * %Stride``. The third Argument
+is a positive integer for which ``%Stride >= <Rows>``. ``%Stride`` is used to
+compute the column memory addresses. I.e., for a column ``C``, its start memory
+address is calculated with ``%Ptr + C * %Stride``. The third argument
``<IsVolatile>`` is a boolean value. The fourth and fifth arguments,
``<Rows>`` and ``<Cols>``, correspond to the number of rows and columns,
respectively, and must be positive, constant integers. The returned vector must
@@ -21103,11 +21106,17 @@ The :ref:`align <attr_align>` parameter attribute can be provided for the
Syntax:
"""""""
+This is an overloaded intrinsic. You can use ``llvm.matrix.column.major.store``
+to store any vector type with a stride of any bitwidth up to 64.
::
- declare void @llvm.matrix.column.major.store.*(
- vectorty %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
+ declare void @llvm.matrix.column.major.store.v4i32.i64(
+ <4 x i32> %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>,
+ i32 <Cols>)
+ declare void @llvm.matrix.column.major.store.v9f64.i32(
+ <9 x double> %In, ptrty %Ptr, i32 %Stride, i1 <IsVolatile>, i32 <Rows>,
+ i32 <Cols>)
Overview:
"""""""""
@@ -21127,7 +21136,7 @@ Arguments:
The first argument ``%In`` is a vector that corresponds to a ``<Rows> x
<Cols>`` matrix to be stored to memory. The second argument ``%Ptr`` is a
pointer to the vector type of ``%In``, and is the start address of the matrix
-in memory. The third argument ``%Stride`` is a positive, constant integer with
+in memory. The third argument ``%Stride`` is a positive integer for which
``%Stride >= <Rows>``. ``%Stride`` is used to compute the column memory
addresses. I.e., for a column ``C``, its start memory addresses is calculated
with ``%Ptr + C * %Stride``. The fourth argument ``<IsVolatile>`` is a boolean
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index d2d6646..85eeabf 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -235,6 +235,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e
- Adds execution modes and decorations to control floating-point computations in both kernels and shaders. It can be used on whole modules and individual instructions.
* - ``SPV_INTEL_predicated_io``
- Adds predicated load and store instructions that conditionally read from or write to memory based on a boolean predicate.
+ * - ``SPV_KHR_maximal_reconvergence``
+ - Adds execution mode and capability to enable maximal reconvergence.
SPIR-V representation in LLVM IR
================================
diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst
index 14232bc..7f57137 100644
--- a/llvm/docs/TableGen/BackEnds.rst
+++ b/llvm/docs/TableGen/BackEnds.rst
@@ -48,7 +48,7 @@ the TableGen files, the back-ends and their users.
For instance, a global contract is that each back-end produces macro-guarded
sections. Based on whether the file is included by a header or a source file,
or even in which context of each file the include is being used, you have
-todefine a macro just before including it, to get the right output:
+to define a macro just before including it, to get the right output:
.. code-block:: c++
@@ -80,8 +80,8 @@ in the TableGen files.
CodeEmitter
-----------
-**Purpose**: CodeEmitterGen uses the descriptions of instructions and their fields to
-construct an automated code emitter: a function that, given a MachineInstr,
+**Purpose**: ``CodeEmitterGen`` uses the descriptions of instructions and their fields to
+construct an automated code emitter: a function that, given a ``MachineInstr``,
returns the (currently, 32-bit unsigned) value of the instruction.
**Output**: C++ code, implementing the target's CodeEmitter
@@ -130,7 +130,7 @@ AsmMatcher
----------
**Purpose**: Emits a target specifier matcher for
-converting parsed assembly operands in the MCInst structures. It also
+converting parsed assembly operands in the ``MCInst`` structures. It also
emits a matcher for custom operand parsing. Extensive documentation is
written on the ``AsmMatcherEmitter.cpp`` file.
@@ -167,7 +167,7 @@ CallingConv
conventions supported by this target.
**Output**: Implement static functions to deal with calling conventions
-chained by matching styles, returning false on no match.
+chained by matching styles, returning ``false`` on no match.
**Usage**: Used in ISelLowering and FastIsel as function pointers to
implementation returned by a CC selection function.
@@ -200,7 +200,7 @@ FastISel
**Purpose**: This tablegen backend emits code for use by the "fast"
instruction selection algorithm. See the comments at the top of
-lib/CodeGen/SelectionDAG/FastISel.cpp for background. This file
+``lib/CodeGen/SelectionDAG/FastISel.cpp`` for background. This file
scans through the target's tablegen instruction-info files
and extracts instructions with obvious-looking patterns, and it emits
code to look up these instructions by type and operator.
@@ -270,23 +270,23 @@ This file is included as part of ``Attr.h``.
ClangAttrParserStringSwitches
-----------------------------
-**Purpose**: Creates AttrParserStringSwitches.inc, which contains
-StringSwitch::Case statements for parser-related string switches. Each switch
+**Purpose**: Creates ``AttrParserStringSwitches.inc``, which contains
+``StringSwitch::Case`` statements for parser-related string switches. Each switch
is given its own macro (such as ``CLANG_ATTR_ARG_CONTEXT_LIST``, or
``CLANG_ATTR_IDENTIFIER_ARG_LIST``), which is expected to be defined before
-including AttrParserStringSwitches.inc, and undefined after.
+including ``AttrParserStringSwitches.inc``, and undefined after.
ClangAttrImpl
-------------
-**Purpose**: Creates AttrImpl.inc, which contains semantic attribute class
+**Purpose**: Creates ``AttrImpl.inc``, which contains semantic attribute class
definitions for any attribute in ``Attr.td`` that has not set ``ASTNode = 0``.
This file is included as part of ``AttrImpl.cpp``.
ClangAttrList
-------------
-**Purpose**: Creates AttrList.inc, which is used when a list of semantic
+**Purpose**: Creates ``AttrList.inc``, which is used when a list of semantic
attribute identifiers is required. For instance, ``AttrKinds.h`` includes this
file to generate the list of ``attr::Kind`` enumeration values. This list is
separated out into multiple categories: attributes, inheritable attributes, and
@@ -297,25 +297,25 @@ functionality required for ``dyn_cast`` and similar APIs.
ClangAttrPCHRead
----------------
-**Purpose**: Creates AttrPCHRead.inc, which is used to deserialize attributes
+**Purpose**: Creates ``AttrPCHRead.inc``, which is used to deserialize attributes
in the ``ASTReader::ReadAttributes`` function.
ClangAttrPCHWrite
-----------------
-**Purpose**: Creates AttrPCHWrite.inc, which is used to serialize attributes in
+**Purpose**: Creates ``AttrPCHWrite.inc``, which is used to serialize attributes in
the ``ASTWriter::WriteAttributes`` function.
ClangAttrSpellings
---------------------
-**Purpose**: Creates AttrSpellings.inc, which is used to implement the
+**Purpose**: Creates ``AttrSpellings.inc``, which is used to implement the
``__has_attribute`` feature test macro.
ClangAttrSpellingListIndex
--------------------------
-**Purpose**: Creates AttrSpellingListIndex.inc, which is used to map parsed
+**Purpose**: Creates ``AttrSpellingListIndex.inc``, which is used to map parsed
attribute spellings (including which syntax or scope was used) to an attribute
spelling list index. These spelling list index values are internal
implementation details exposed via
@@ -324,26 +324,26 @@ implementation details exposed via
ClangAttrVisitor
-------------------
-**Purpose**: Creates AttrVisitor.inc, which is used when implementing
+**Purpose**: Creates ``AttrVisitor.inc``, which is used when implementing
recursive AST visitors.
ClangAttrTemplateInstantiate
----------------------------
-**Purpose**: Creates AttrTemplateInstantiate.inc, which implements the
+**Purpose**: Creates ``AttrTemplateInstantiate.inc``, which implements the
``instantiateTemplateAttribute`` function, used when instantiating a template
that requires an attribute to be cloned.
ClangAttrParsedAttrList
-----------------------
-**Purpose**: Creates AttrParsedAttrList.inc, which is used to generate the
+**Purpose**: Creates ``AttrParsedAttrList.inc``, which is used to generate the
``AttributeList::Kind`` parsed attribute enumeration.
ClangAttrParsedAttrImpl
-----------------------
-**Purpose**: Creates AttrParsedAttrImpl.inc, which is used by
+**Purpose**: Creates ``AttrParsedAttrImpl.inc``, which is used by
``AttributeList.cpp`` to implement several functions on the ``AttributeList``
class. This functionality is implemented via the ``AttrInfoMap ParsedAttrInfo``
array, which contains one element per parsed attribute object.
@@ -351,14 +351,14 @@ array, which contains one element per parsed attribute object.
ClangAttrParsedAttrKinds
------------------------
-**Purpose**: Creates AttrParsedAttrKinds.inc, which is used to implement the
+**Purpose**: Creates ``AttrParsedAttrKinds.inc``, which is used to implement the
``AttributeList::getKind`` function, mapping a string (and syntax) to a parsed
attribute ``AttributeList::Kind`` enumeration.
ClangAttrDump
-------------
-**Purpose**: Creates AttrDump.inc, which dumps information about an attribute.
+**Purpose**: Creates ``AttrDump.inc``, which dumps information about an attribute.
It is used to implement ``ASTDumper::dumpAttr``.
ClangDiagsDefs
@@ -424,7 +424,7 @@ Generate list of commands that are used in documentation comments.
ArmNeon
-------
-Generate arm_neon.h for clang.
+Generate ``arm_neon.h`` for clang.
ArmNeonSema
-----------
@@ -473,7 +473,7 @@ to a built-in backend.
**Output**:
-The root of the output file is a JSON object (i.e. dictionary),
+The root of the output file is a JSON object (i.e., dictionary),
containing the following fixed keys:
* ``!tablegen_json_version``: a numeric version field that will
@@ -520,7 +520,7 @@ conventions described below.
Some TableGen data types are translated directly into the
corresponding JSON type:
-* A completely undefined value (e.g. for a variable declared without
+* A completely undefined value (e.g., for a variable declared without
initializer in some superclass of this record, and never initialized
by the record itself or any other superclass) is emitted as the JSON
``null`` value.
@@ -964,7 +964,7 @@ Here is the modified lookup function.
The new lookup function will return an iterator range with first pointer to the
first result and the last pointer to the last matching result from the table.
-However, please note that the support for emitting modified definition exists
+However, please note that the support for emitting a modified definition exists
for ``PrimaryKeyName`` only.
The ``PrimaryKeyEarlyOut`` field, when set to 1, modifies the lookup
diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h
index 2ecd69a..70da3a6 100644
--- a/llvm/include/llvm-c/DebugInfo.h
+++ b/llvm/include/llvm-c/DebugInfo.h
@@ -204,6 +204,11 @@ enum {
typedef unsigned LLVMMetadataKind;
/**
+ * The kind of checksum to emit.
+ */
+typedef enum { CSK_MD5, CSK_SHA1, CSK_SHA256 } LLVMChecksumKind;
+
+/**
* An LLVM DWARF type encoding.
*/
typedef unsigned LLVMDWARFTypeEncoding;
@@ -327,6 +332,25 @@ LLVM_C_ABI LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder,
size_t DirectoryLen);
/**
+ * Create a file descriptor to hold debugging information for a file.
+ * \param Builder The \c DIBuilder.
+ * \param Filename File name.
+ * \param FilenameLen The length of the C string passed to \c Filename.
+ * \param Directory Directory.
+ * \param DirectoryLen The length of the C string passed to \c Directory.
+ * \param ChecksumKind The kind of checksum, e.g. MD5 or SHA256.
+ * \param Checksum The checksum.
+ * \param ChecksumLen The length of the checksum.
+ * \param Source The embedded source.
+ * \param SourceLen The length of the source.
+ */
+LLVM_C_ABI LLVMMetadataRef LLVMDIBuilderCreateFileWithChecksum(
+ LLVMDIBuilderRef Builder, const char *Filename, size_t FilenameLen,
+ const char *Directory, size_t DirectoryLen, LLVMChecksumKind ChecksumKind,
+ const char *Checksum, size_t ChecksumLen, const char *Source,
+ size_t SourceLen);
+
+/**
* Creates a new descriptor for a module with the specified parent scope.
* \param Builder The \c DIBuilder.
* \param ParentScope The parent scope containing this module declaration.
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index a1bfce7..bccdb89 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -138,10 +138,16 @@ enum lostFraction { // Example of truncated bits:
/// New operations: sqrt, IEEE remainder, C90 fmod, nexttoward.
///
+namespace detail {
+class IEEEFloat;
+class DoubleAPFloat;
+} // namespace detail
+
// This is the common type definitions shared by APFloat and its internal
// implementation classes. This struct should not define any non-static data
// members.
-struct APFloatBase {
+class APFloatBase {
+public:
typedef APInt::WordType integerPart;
static constexpr unsigned integerPartWidth = APInt::APINT_BITS_PER_WORD;
@@ -257,30 +263,64 @@ struct APFloatBase {
LLVM_ABI static const llvm::fltSemantics &EnumToSemantics(Semantics S);
LLVM_ABI static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem);
- LLVM_ABI static const fltSemantics &IEEEhalf() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &BFloat() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &IEEEsingle() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &IEEEdouble() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &IEEEquad() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &PPCDoubleDouble() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &PPCDoubleDoubleLegacy() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E5M2() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E5M2FNUZ() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E4M3() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E4M3FN() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E3M4() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &FloatTF32() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float8E8M0FNU() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &Float4E2M1FN() LLVM_READNONE;
- LLVM_ABI static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
+private:
+ LLVM_ABI static const fltSemantics semIEEEhalf;
+ LLVM_ABI static const fltSemantics semBFloat;
+ LLVM_ABI static const fltSemantics semIEEEsingle;
+ LLVM_ABI static const fltSemantics semIEEEdouble;
+ LLVM_ABI static const fltSemantics semIEEEquad;
+ LLVM_ABI static const fltSemantics semFloat8E5M2;
+ LLVM_ABI static const fltSemantics semFloat8E5M2FNUZ;
+ LLVM_ABI static const fltSemantics semFloat8E4M3;
+ LLVM_ABI static const fltSemantics semFloat8E4M3FN;
+ LLVM_ABI static const fltSemantics semFloat8E4M3FNUZ;
+ LLVM_ABI static const fltSemantics semFloat8E4M3B11FNUZ;
+ LLVM_ABI static const fltSemantics semFloat8E3M4;
+ LLVM_ABI static const fltSemantics semFloatTF32;
+ LLVM_ABI static const fltSemantics semFloat8E8M0FNU;
+ LLVM_ABI static const fltSemantics semFloat6E3M2FN;
+ LLVM_ABI static const fltSemantics semFloat6E2M3FN;
+ LLVM_ABI static const fltSemantics semFloat4E2M1FN;
+ LLVM_ABI static const fltSemantics semX87DoubleExtended;
+ LLVM_ABI static const fltSemantics semBogus;
+ LLVM_ABI static const fltSemantics semPPCDoubleDouble;
+ LLVM_ABI static const fltSemantics semPPCDoubleDoubleLegacy;
+
+ friend class detail::IEEEFloat;
+ friend class detail::DoubleAPFloat;
+ friend class APFloat;
+
+public:
+ static const fltSemantics &IEEEhalf() { return semIEEEhalf; }
+ static const fltSemantics &BFloat() { return semBFloat; }
+ static const fltSemantics &IEEEsingle() { return semIEEEsingle; }
+ static const fltSemantics &IEEEdouble() { return semIEEEdouble; }
+ static const fltSemantics &IEEEquad() { return semIEEEquad; }
+ static const fltSemantics &PPCDoubleDouble() { return semPPCDoubleDouble; }
+ static const fltSemantics &PPCDoubleDoubleLegacy() {
+ return semPPCDoubleDoubleLegacy;
+ }
+ static const fltSemantics &Float8E5M2() { return semFloat8E5M2; }
+ static const fltSemantics &Float8E5M2FNUZ() { return semFloat8E5M2FNUZ; }
+ static const fltSemantics &Float8E4M3() { return semFloat8E4M3; }
+ static const fltSemantics &Float8E4M3FN() { return semFloat8E4M3FN; }
+ static const fltSemantics &Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; }
+ static const fltSemantics &Float8E4M3B11FNUZ() {
+ return semFloat8E4M3B11FNUZ;
+ }
+ static const fltSemantics &Float8E3M4() { return semFloat8E3M4; }
+ static const fltSemantics &FloatTF32() { return semFloatTF32; }
+ static const fltSemantics &Float8E8M0FNU() { return semFloat8E8M0FNU; }
+ static const fltSemantics &Float6E3M2FN() { return semFloat6E3M2FN; }
+ static const fltSemantics &Float6E2M3FN() { return semFloat6E2M3FN; }
+ static const fltSemantics &Float4E2M1FN() { return semFloat4E2M1FN; }
+ static const fltSemantics &x87DoubleExtended() {
+ return semX87DoubleExtended;
+ }
/// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
/// anything real.
- LLVM_ABI static const fltSemantics &Bogus() LLVM_READNONE;
+ static const fltSemantics &Bogus() { return semBogus; }
// Returns true if any number described by this semantics can be precisely
// represented by the specified semantics. Does not take into account
@@ -927,69 +967,11 @@ class APFloat : public APFloatBase {
llvm_unreachable("Unexpected semantics");
}
- ~Storage() {
- if (usesLayout<IEEEFloat>(*semantics)) {
- IEEE.~IEEEFloat();
- return;
- }
- if (usesLayout<DoubleAPFloat>(*semantics)) {
- Double.~DoubleAPFloat();
- return;
- }
- llvm_unreachable("Unexpected semantics");
- }
-
- Storage(const Storage &RHS) {
- if (usesLayout<IEEEFloat>(*RHS.semantics)) {
- new (this) IEEEFloat(RHS.IEEE);
- return;
- }
- if (usesLayout<DoubleAPFloat>(*RHS.semantics)) {
- new (this) DoubleAPFloat(RHS.Double);
- return;
- }
- llvm_unreachable("Unexpected semantics");
- }
-
- Storage(Storage &&RHS) {
- if (usesLayout<IEEEFloat>(*RHS.semantics)) {
- new (this) IEEEFloat(std::move(RHS.IEEE));
- return;
- }
- if (usesLayout<DoubleAPFloat>(*RHS.semantics)) {
- new (this) DoubleAPFloat(std::move(RHS.Double));
- return;
- }
- llvm_unreachable("Unexpected semantics");
- }
-
- Storage &operator=(const Storage &RHS) {
- if (usesLayout<IEEEFloat>(*semantics) &&
- usesLayout<IEEEFloat>(*RHS.semantics)) {
- IEEE = RHS.IEEE;
- } else if (usesLayout<DoubleAPFloat>(*semantics) &&
- usesLayout<DoubleAPFloat>(*RHS.semantics)) {
- Double = RHS.Double;
- } else if (this != &RHS) {
- this->~Storage();
- new (this) Storage(RHS);
- }
- return *this;
- }
-
- Storage &operator=(Storage &&RHS) {
- if (usesLayout<IEEEFloat>(*semantics) &&
- usesLayout<IEEEFloat>(*RHS.semantics)) {
- IEEE = std::move(RHS.IEEE);
- } else if (usesLayout<DoubleAPFloat>(*semantics) &&
- usesLayout<DoubleAPFloat>(*RHS.semantics)) {
- Double = std::move(RHS.Double);
- } else if (this != &RHS) {
- this->~Storage();
- new (this) Storage(std::move(RHS));
- }
- return *this;
- }
+ LLVM_ABI ~Storage();
+ LLVM_ABI Storage(const Storage &RHS);
+ LLVM_ABI Storage(Storage &&RHS);
+ LLVM_ABI Storage &operator=(const Storage &RHS);
+ LLVM_ABI Storage &operator=(Storage &&RHS);
} U;
template <typename T> static bool usesLayout(const fltSemantics &Semantics) {
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 4bda50f..25b5262 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -42,7 +42,7 @@ namespace detail {
// We extend a pair to allow users to override the bucket type with their own
// implementation without requiring two members.
template <typename KeyT, typename ValueT>
-struct DenseMapPair : public std::pair<KeyT, ValueT> {
+struct DenseMapPair : std::pair<KeyT, ValueT> {
using std::pair<KeyT, ValueT>::pair;
KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
diff --git a/llvm/include/llvm/ADT/DepthFirstIterator.h b/llvm/include/llvm/ADT/DepthFirstIterator.h
index 4ced758..3c54f32 100644
--- a/llvm/include/llvm/ADT/DepthFirstIterator.h
+++ b/llvm/include/llvm/ADT/DepthFirstIterator.h
@@ -66,8 +66,8 @@ public:
// one more method, completed, which is invoked when all children of a
// node have been processed. It is intended to distinguish of back and
// cross edges in the spanning tree but is not used in the common case.
-template <typename NodeRef, unsigned SmallSize=8>
-struct df_iterator_default_set : public SmallPtrSet<NodeRef, SmallSize> {
+template <typename NodeRef, unsigned SmallSize = 8>
+struct df_iterator_default_set : SmallPtrSet<NodeRef, SmallSize> {
using BaseSet = SmallPtrSet<NodeRef, SmallSize>;
using iterator = typename BaseSet::iterator;
@@ -235,8 +235,10 @@ iterator_range<df_iterator<T>> depth_first(const T& G) {
}
// Provide global definitions of external depth first iterators...
-template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
-struct df_ext_iterator : public df_iterator<T, SetTy, true> {
+template <class T,
+ class SetTy =
+ df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
+struct df_ext_iterator : df_iterator<T, SetTy, true> {
df_ext_iterator(const df_iterator<T, SetTy, true> &V)
: df_iterator<T, SetTy, true>(V) {}
};
@@ -262,7 +264,7 @@ template <class T,
class SetTy =
df_iterator_default_set<typename GraphTraits<T>::NodeRef>,
bool External = false>
-struct idf_iterator : public df_iterator<Inverse<T>, SetTy, External> {
+struct idf_iterator : df_iterator<Inverse<T>, SetTy, External> {
idf_iterator(const df_iterator<Inverse<T>, SetTy, External> &V)
: df_iterator<Inverse<T>, SetTy, External>(V) {}
};
@@ -284,8 +286,10 @@ iterator_range<idf_iterator<T>> inverse_depth_first(const T& G) {
}
// Provide global definitions of external inverse depth first iterators...
-template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
-struct idf_ext_iterator : public idf_iterator<T, SetTy, true> {
+template <class T,
+ class SetTy =
+ df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
+struct idf_ext_iterator : idf_iterator<T, SetTy, true> {
idf_ext_iterator(const idf_iterator<T, SetTy, true> &V)
: idf_iterator<T, SetTy, true>(V) {}
idf_ext_iterator(const df_iterator<Inverse<T>, SetTy, true> &V)
diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h
index 310539f..8b2425e 100644
--- a/llvm/include/llvm/ADT/ImmutableSet.h
+++ b/llvm/include/llvm/ADT/ImmutableSet.h
@@ -931,8 +931,7 @@ struct ImutProfileInfo<T*> {
/// ImutContainerInfo - Generic definition of comparison operations for
/// elements of immutable containers that defaults to using
/// std::equal_to<> and std::less<> to perform comparison of elements.
-template <typename T>
-struct ImutContainerInfo : public ImutProfileInfo<T> {
+template <typename T> struct ImutContainerInfo : ImutProfileInfo<T> {
using value_type = typename ImutProfileInfo<T>::value_type;
using value_type_ref = typename ImutProfileInfo<T>::value_type_ref;
using key_type = value_type;
@@ -957,8 +956,7 @@ struct ImutContainerInfo : public ImutProfileInfo<T> {
/// ImutContainerInfo - Specialization for pointer values to treat pointers
/// as references to unique objects. Pointers are thus compared by
/// their addresses.
-template <typename T>
-struct ImutContainerInfo<T*> : public ImutProfileInfo<T*> {
+template <typename T> struct ImutContainerInfo<T *> : ImutProfileInfo<T *> {
using value_type = typename ImutProfileInfo<T*>::value_type;
using value_type_ref = typename ImutProfileInfo<T*>::value_type_ref;
using key_type = value_type;
diff --git a/llvm/include/llvm/ADT/PostOrderIterator.h b/llvm/include/llvm/ADT/PostOrderIterator.h
index 1cbd3c1..d9aa452 100644
--- a/llvm/include/llvm/ADT/PostOrderIterator.h
+++ b/llvm/include/llvm/ADT/PostOrderIterator.h
@@ -200,7 +200,7 @@ template <class T> iterator_range<po_iterator<T>> post_order(const T &G) {
// Provide global definitions of external postorder iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>>
-struct po_ext_iterator : public po_iterator<T, SetType, true> {
+struct po_ext_iterator : po_iterator<T, SetType, true> {
po_ext_iterator(const po_iterator<T, SetType, true> &V) :
po_iterator<T, SetType, true>(V) {}
};
@@ -223,7 +223,7 @@ iterator_range<po_ext_iterator<T, SetType>> post_order_ext(const T &G, SetType &
// Provide global definitions of inverse post order iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>,
bool External = false>
-struct ipo_iterator : public po_iterator<Inverse<T>, SetType, External> {
+struct ipo_iterator : po_iterator<Inverse<T>, SetType, External> {
ipo_iterator(const po_iterator<Inverse<T>, SetType, External> &V) :
po_iterator<Inverse<T>, SetType, External> (V) {}
};
@@ -245,7 +245,7 @@ iterator_range<ipo_iterator<T>> inverse_post_order(const T &G) {
// Provide global definitions of external inverse postorder iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>>
-struct ipo_ext_iterator : public ipo_iterator<T, SetType, true> {
+struct ipo_ext_iterator : ipo_iterator<T, SetType, true> {
ipo_ext_iterator(const ipo_iterator<T, SetType, true> &V) :
ipo_iterator<T, SetType, true>(V) {}
ipo_ext_iterator(const po_iterator<Inverse<T>, SetType, true> &V) :
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 658f262..a9841c6 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -674,7 +674,7 @@ using zip_traits = iterator_facade_base<
ReferenceTupleType *, ReferenceTupleType>;
template <typename ZipType, typename ReferenceTupleType, typename... Iters>
-struct zip_common : public zip_traits<ZipType, ReferenceTupleType, Iters...> {
+struct zip_common : zip_traits<ZipType, ReferenceTupleType, Iters...> {
using Base = zip_traits<ZipType, ReferenceTupleType, Iters...>;
using IndexSequence = std::index_sequence_for<Iters...>;
using value_type = typename Base::value_type;
diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h
index da9d3ab0..273a5cf 100644
--- a/llvm/include/llvm/ADT/STLForwardCompat.h
+++ b/llvm/include/llvm/ADT/STLForwardCompat.h
@@ -26,6 +26,54 @@ namespace llvm {
// Features from C++20
//===----------------------------------------------------------------------===//
+namespace numbers {
+// clang-format off
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T e_v = T(0x1.5bf0a8b145769P+1); // (2.7182818284590452354) https://oeis.org/A001113
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T egamma_v = T(0x1.2788cfc6fb619P-1); // (.57721566490153286061) https://oeis.org/A001620
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T ln2_v = T(0x1.62e42fefa39efP-1); // (.69314718055994530942) https://oeis.org/A002162
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T ln10_v = T(0x1.26bb1bbb55516P+1); // (2.3025850929940456840) https://oeis.org/A002392
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T log2e_v = T(0x1.71547652b82feP+0); // (1.4426950408889634074)
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T log10e_v = T(0x1.bcb7b1526e50eP-2); // (.43429448190325182765)
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T pi_v = T(0x1.921fb54442d18P+1); // (3.1415926535897932385) https://oeis.org/A000796
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T inv_pi_v = T(0x1.45f306dc9c883P-2); // (.31830988618379067154) https://oeis.org/A049541
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T inv_sqrtpi_v = T(0x1.20dd750429b6dP-1); // (.56418958354775628695) https://oeis.org/A087197
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T sqrt2_v = T(0x1.6a09e667f3bcdP+0); // (1.4142135623730950488) https://oeis.org/A002193
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T inv_sqrt2_v = T(0x1.6a09e667f3bcdP-1); // (.70710678118654752440)
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T sqrt3_v = T(0x1.bb67ae8584caaP+0); // (1.7320508075688772935) https://oeis.org/A002194
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T inv_sqrt3_v = T(0x1.279a74590331cP-1); // (.57735026918962576451)
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T phi_v = T(0x1.9e3779b97f4a8P+0); // (1.6180339887498948482) https://oeis.org/A001622
+
+inline constexpr double e = e_v<double>;
+inline constexpr double egamma = egamma_v<double>;
+inline constexpr double ln2 = ln2_v<double>;
+inline constexpr double ln10 = ln10_v<double>;
+inline constexpr double log2e = log2e_v<double>;
+inline constexpr double log10e = log10e_v<double>;
+inline constexpr double pi = pi_v<double>;
+inline constexpr double inv_pi = inv_pi_v<double>;
+inline constexpr double inv_sqrtpi = inv_sqrtpi_v<double>;
+inline constexpr double sqrt2 = sqrt2_v<double>;
+inline constexpr double inv_sqrt2 = inv_sqrt2_v<double>;
+inline constexpr double sqrt3 = sqrt3_v<double>;
+inline constexpr double inv_sqrt3 = inv_sqrt3_v<double>;
+inline constexpr double phi = phi_v<double>;
+// clang-format on
+} // namespace numbers
+
template <typename T>
struct remove_cvref // NOLINT(readability-identifier-naming)
{
diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h
index f588a77..8e7c8b3 100644
--- a/llvm/include/llvm/ADT/SmallPtrSet.h
+++ b/llvm/include/llvm/ADT/SmallPtrSet.h
@@ -532,18 +532,8 @@ class SmallPtrSet : public SmallPtrSetImpl<PtrType> {
using BaseT = SmallPtrSetImpl<PtrType>;
- // A constexpr version of llvm::bit_ceil.
- // TODO: Replace this with std::bit_ceil once C++20 is available.
- static constexpr size_t RoundUpToPowerOfTwo(size_t X) {
- size_t C = 1;
- size_t CMax = C << (std::numeric_limits<size_t>::digits - 1);
- while (C < X && C < CMax)
- C <<= 1;
- return C;
- }
-
// Make sure that SmallSize is a power of two, round up if not.
- static constexpr size_t SmallSizePowTwo = RoundUpToPowerOfTwo(SmallSize);
+ static constexpr size_t SmallSizePowTwo = llvm::bit_ceil_constexpr(SmallSize);
/// SmallStorage - Fixed size storage used in 'small mode'.
const void *SmallStorage[SmallSizePowTwo];
diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h
index 66c4f94..8b60b69 100644
--- a/llvm/include/llvm/ADT/bit.h
+++ b/llvm/include/llvm/ADT/bit.h
@@ -336,34 +336,44 @@ template <typename T> [[nodiscard]] T bit_ceil(T Value) {
return T(1) << llvm::bit_width<T>(Value - 1u);
}
-// Forward-declare rotr so that rotl can use it.
-template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
-[[nodiscard]] constexpr T rotr(T V, int R);
+/// Returns the smallest integral power of two no smaller than Value if Value is
+/// nonzero. Returns 1 otherwise.
+///
+/// Ex. bit_ceil_constexpr(5) == 8.
+///
+/// The return value is undefined if the input is larger than the largest power
+/// of two representable in T.
+template <typename T> [[nodiscard]] constexpr T bit_ceil_constexpr(T Value) {
+ static_assert(std::is_unsigned_v<T>,
+ "Only unsigned integral types are allowed.");
+ if (Value < 2)
+ return 1;
+ return T(1) << llvm::bit_width_constexpr<T>(Value - 1u);
+}
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
[[nodiscard]] constexpr T rotl(T V, int R) {
- unsigned N = std::numeric_limits<T>::digits;
+ constexpr unsigned N = std::numeric_limits<T>::digits;
- R = R % N;
- if (!R)
- return V;
+ static_assert(has_single_bit(N), "& (N - 1) is only valid for powers of two");
+ R = R & (N - 1);
- if (R < 0)
- return llvm::rotr(V, -R);
+ if (R == 0)
+ return V;
return (V << R) | (V >> (N - R));
}
-template <typename T, typename> [[nodiscard]] constexpr T rotr(T V, int R) {
- unsigned N = std::numeric_limits<T>::digits;
+template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
+[[nodiscard]] constexpr T rotr(T V, int R) {
+ constexpr unsigned N = std::numeric_limits<T>::digits;
+
+ static_assert(has_single_bit(N), "& (N - 1) is only valid for powers of two");
+ R = R & (N - 1);
- R = R % N;
- if (!R)
+ if (R == 0)
return V;
- if (R < 0)
- return llvm::rotl(V, -R);
-
return (V >> R) | (V << (N - R));
}
diff --git a/llvm/include/llvm/Analysis/LoopAnalysisManager.h b/llvm/include/llvm/Analysis/LoopAnalysisManager.h
index fc69cb0..1755257 100644
--- a/llvm/include/llvm/Analysis/LoopAnalysisManager.h
+++ b/llvm/include/llvm/Analysis/LoopAnalysisManager.h
@@ -36,7 +36,6 @@ namespace llvm {
class AAResults;
class AssumptionCache;
-class BlockFrequencyInfo;
class DominatorTree;
class Function;
class Loop;
@@ -58,7 +57,6 @@ struct LoopStandardAnalysisResults {
ScalarEvolution &SE;
TargetLibraryInfo &TLI;
TargetTransformInfo &TTI;
- BlockFrequencyInfo *BFI;
MemorySSA *MSSA;
};
diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index a7a6a27..0ecb114 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -617,7 +617,7 @@ public:
};
/// Function to print a loop's contents as LLVM's text IR assembly.
-LLVM_ABI void printLoop(Loop &L, raw_ostream &OS,
+LLVM_ABI void printLoop(const Loop &L, raw_ostream &OS,
const std::string &Banner = "");
/// Find and return the loop attribute node for the attribute @p Name in
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index e5a6c8c..3d3ec14 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1345,6 +1345,7 @@ public:
class LoopGuards {
DenseMap<const SCEV *, const SCEV *> RewriteMap;
+ SmallDenseSet<std::pair<const SCEV *, const SCEV *>> NotEqual;
bool PreserveNUW = false;
bool PreserveNSW = false;
ScalarEvolution &SE;
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 68198ec..9354eef 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -256,6 +256,18 @@ m_scev_UDiv(const Op0_t &Op0, const Op1_t &Op1) {
return m_scev_Binary<SCEVUDivExpr>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t>
+inline SCEVBinaryExpr_match<SCEVSMaxExpr, Op0_t, Op1_t>
+m_scev_SMax(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_scev_Binary<SCEVSMaxExpr>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SCEVBinaryExpr_match<SCEVMinMaxExpr, Op0_t, Op1_t>
+m_scev_MinMax(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_scev_Binary<SCEVMinMaxExpr>(Op0, Op1);
+}
+
/// Match unsigned remainder pattern.
/// Matches patterns generated by getURemExpr.
template <typename Op0_t, typename Op1_t> struct SCEVURem_match {
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index e0292c2..457c60e3b 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -14,7 +14,7 @@
// dbg.value(metadata i32 %foo, ...)
// %bar = void call @ext(%foo);
//
-// and all information is stored in the Value / Metadata hierachy defined
+// and all information is stored in the Value / Metadata hierarchy defined
// elsewhere in LLVM. In the "DbgRecord" design, each instruction /may/ have a
// connection with a DbgMarker, which identifies a position immediately before
// the instruction, and each DbgMarker /may/ then have connections to DbgRecords
@@ -37,7 +37,7 @@
//
// This structure separates the two concerns of the position of the debug-info
// in the function, and the Value that it refers to. It also creates a new
-// "place" in-between the Value / Metadata hierachy where we can customise
+// "place" in-between the Value / Metadata hierarchy where we can customise
// storage and allocation techniques to better suite debug-info workloads.
// NB: as of the initial prototype, none of that has actually been attempted
// yet.
@@ -162,7 +162,7 @@ public:
LLVM_ABI bool isIdenticalToWhenDefined(const DbgRecord &R) const;
/// Convert this DbgRecord back into an appropriate llvm.dbg.* intrinsic.
/// \p InsertBefore Optional position to insert this intrinsic.
- /// \returns A new llvm.dbg.* intrinsic representiung this DbgRecord.
+ /// \returns A new llvm.dbg.* intrinsic representing this DbgRecord.
LLVM_ABI DbgInfoIntrinsic *
createDebugIntrinsic(Module *M, Instruction *InsertBefore) const;
///@}
@@ -530,7 +530,7 @@ public:
LLVM_ABI void setKillAddress();
/// Check whether this kills the address component. This doesn't take into
/// account the position of the intrinsic, therefore a returned value of false
- /// does not guarentee the address is a valid location for the variable at the
+ /// does not guarantee the address is a valid location for the variable at the
/// intrinsic's position in IR.
LLVM_ABI bool isKillAddress() const;
@@ -539,7 +539,7 @@ public:
LLVM_ABI DbgVariableRecord *clone() const;
/// Convert this DbgVariableRecord back into a dbg.value intrinsic.
/// \p InsertBefore Optional position to insert this intrinsic.
- /// \returns A new dbg.value intrinsic representiung this DbgVariableRecord.
+ /// \returns A new dbg.value intrinsic representing this DbgVariableRecord.
LLVM_ABI DbgVariableIntrinsic *
createDebugIntrinsic(Module *M, Instruction *InsertBefore) const;
diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h
index 04d0391..58822a0 100644
--- a/llvm/include/llvm/IR/Value.h
+++ b/llvm/include/llvm/IR/Value.h
@@ -484,8 +484,8 @@ public:
/// Remove every uses that can safely be removed.
///
/// This will remove for example uses in llvm.assume.
- /// This should be used when performing want to perform a tranformation but
- /// some Droppable uses pervent it.
+ /// This should be used when you want to perform a transformation but
+ /// some Droppable uses prevent it.
/// This function optionally takes a filter to only remove some droppable
/// uses.
LLVM_ABI void
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 3a9a7f7..000472f 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -105,12 +105,6 @@ setupStatsFile(StringRef StatsFilename);
/// ordered indices to elements in the input array.
LLVM_ABI std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
-/// Updates MemProf attributes (and metadata) based on whether the index
-/// has recorded that we are linking with allocation libraries containing
-/// the necessary APIs for downstream transformations.
-LLVM_ABI void updateMemProfAttributes(Module &Mod,
- const ModuleSummaryIndex &Index);
-
class LTO;
struct SymbolResolution;
diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
index a4ca54e..f9d7c76 100644
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -103,7 +103,7 @@ inline Align assumeAligned(uint64_t Value) {
/// This struct is a compact representation of a valid (power of two) or
/// undefined (0) alignment.
-struct MaybeAlign : public std::optional<Align> {
+struct MaybeAlign : std::optional<Align> {
private:
using UP = std::optional<Align>;
diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h
index 2a9a149..6f6df2e 100644
--- a/llvm/include/llvm/Support/Casting.h
+++ b/llvm/include/llvm/Support/Casting.h
@@ -340,7 +340,7 @@ struct ValueFromPointerCast
/// during the cast. It's also a good example of how to implement a move-only
/// cast.
template <typename To, typename From, typename Derived = void>
-struct UniquePtrCast : public CastIsPossible<To, From *> {
+struct UniquePtrCast : CastIsPossible<To, From *> {
using Self = detail::SelfType<Derived, UniquePtrCast<To, From>>;
using CastResultType = std::unique_ptr<
std::remove_reference_t<typename cast_retty<To, From>::ret_type>>;
@@ -473,7 +473,7 @@ struct ForwardToPointerCast {
// take advantage of the cast traits whenever possible!
template <typename To, typename From, typename Enable = void>
-struct CastInfo : public CastIsPossible<To, From> {
+struct CastInfo : CastIsPossible<To, From> {
using Self = CastInfo<To, From, Enable>;
using CastReturnType = typename cast_retty<To, From>::ret_type;
@@ -536,8 +536,7 @@ struct CastInfo<To, std::unique_ptr<From>> : public UniquePtrCast<To, From> {};
/// the input is std::optional<From> that the output can be std::optional<To>.
/// If that's not the case, specialize CastInfo for your use case.
template <typename To, typename From>
-struct CastInfo<To, std::optional<From>> : public OptionalValueCast<To, From> {
-};
+struct CastInfo<To, std::optional<From>> : OptionalValueCast<To, From> {};
/// isa<X> - Return true if the parameter to the template is an instance of one
/// of the template type arguments. Used like this:
diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index dd05c53..5a5f00e 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -549,7 +549,7 @@ template <class DataType> struct OptionValue;
// The default value safely does nothing. Option value printing is only
// best-effort.
template <class DataType, bool isClass>
-struct OptionValueBase : public GenericOptionValue {
+struct OptionValueBase : GenericOptionValue {
// Temporary storage for argument passing.
using WrapperType = OptionValue<DataType>;
diff --git a/llvm/include/llvm/Support/DOTGraphTraits.h b/llvm/include/llvm/Support/DOTGraphTraits.h
index ffa9abe..3b9fe00 100644
--- a/llvm/include/llvm/Support/DOTGraphTraits.h
+++ b/llvm/include/llvm/Support/DOTGraphTraits.h
@@ -162,9 +162,8 @@ public:
/// graphs are converted to 'dot' graphs. When specializing, you may inherit
/// from DefaultDOTGraphTraits if you don't need to override everything.
///
-template <typename Ty>
-struct DOTGraphTraits : public DefaultDOTGraphTraits {
- DOTGraphTraits (bool simple=false) : DefaultDOTGraphTraits (simple) {}
+template <typename Ty> struct DOTGraphTraits : DefaultDOTGraphTraits {
+ using DefaultDOTGraphTraits::DefaultDOTGraphTraits;
};
} // End llvm namespace
diff --git a/llvm/include/llvm/Support/ELFAttributes.h b/llvm/include/llvm/Support/ELFAttributes.h
index 270246f..5771a84 100644
--- a/llvm/include/llvm/Support/ELFAttributes.h
+++ b/llvm/include/llvm/Support/ELFAttributes.h
@@ -48,8 +48,6 @@ struct SubsectionAndTagToTagName {
StringRef SubsectionName;
unsigned Tag;
StringRef TagName;
- SubsectionAndTagToTagName(StringRef SN, unsigned Tg, StringRef TN)
- : SubsectionName(SN), Tag(Tg), TagName(TN) {}
};
namespace ELFAttrs {
diff --git a/llvm/include/llvm/Support/LSP/Protocol.h b/llvm/include/llvm/Support/LSP/Protocol.h
index 93b82f1..e38203a 100644
--- a/llvm/include/llvm/Support/LSP/Protocol.h
+++ b/llvm/include/llvm/Support/LSP/Protocol.h
@@ -449,7 +449,7 @@ struct ReferenceContext {
bool fromJSON(const llvm::json::Value &value, ReferenceContext &result,
llvm::json::Path path);
-struct ReferenceParams : public TextDocumentPositionParams {
+struct ReferenceParams : TextDocumentPositionParams {
ReferenceContext context;
};
diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h
index ed29826..4ba3867 100644
--- a/llvm/include/llvm/Support/MD5.h
+++ b/llvm/include/llvm/Support/MD5.h
@@ -41,7 +41,7 @@ template <typename T> class ArrayRef;
class MD5 {
public:
- struct MD5Result : public std::array<uint8_t, 16> {
+ struct MD5Result : std::array<uint8_t, 16> {
LLVM_ABI SmallString<32> digest() const;
uint64_t low() const {
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index c2716a9..41232335 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -13,6 +13,7 @@
#ifndef LLVM_SUPPORT_MATHEXTRAS_H
#define LLVM_SUPPORT_MATHEXTRAS_H
+#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/Compiler.h"
#include <cassert>
@@ -42,38 +43,28 @@ using common_sint =
/// Mathematical constants.
namespace numbers {
-// TODO: Track C++20 std::numbers.
// clang-format off
-constexpr double e = 0x1.5bf0a8b145769P+1, // (2.7182818284590452354) https://oeis.org/A001113
- egamma = 0x1.2788cfc6fb619P-1, // (.57721566490153286061) https://oeis.org/A001620
- ln2 = 0x1.62e42fefa39efP-1, // (.69314718055994530942) https://oeis.org/A002162
- ln10 = 0x1.26bb1bbb55516P+1, // (2.3025850929940456840) https://oeis.org/A002392
- log2e = 0x1.71547652b82feP+0, // (1.4426950408889634074)
- log10e = 0x1.bcb7b1526e50eP-2, // (.43429448190325182765)
- pi = 0x1.921fb54442d18P+1, // (3.1415926535897932385) https://oeis.org/A000796
- inv_pi = 0x1.45f306dc9c883P-2, // (.31830988618379067154) https://oeis.org/A049541
- sqrtpi = 0x1.c5bf891b4ef6bP+0, // (1.7724538509055160273) https://oeis.org/A002161
- inv_sqrtpi = 0x1.20dd750429b6dP-1, // (.56418958354775628695) https://oeis.org/A087197
- sqrt2 = 0x1.6a09e667f3bcdP+0, // (1.4142135623730950488) https://oeis.org/A00219
- inv_sqrt2 = 0x1.6a09e667f3bcdP-1, // (.70710678118654752440)
- sqrt3 = 0x1.bb67ae8584caaP+0, // (1.7320508075688772935) https://oeis.org/A002194
- inv_sqrt3 = 0x1.279a74590331cP-1, // (.57735026918962576451)
- phi = 0x1.9e3779b97f4a8P+0; // (1.6180339887498948482) https://oeis.org/A001622
-constexpr float ef = 0x1.5bf0a8P+1F, // (2.71828183) https://oeis.org/A001113
- egammaf = 0x1.2788d0P-1F, // (.577215665) https://oeis.org/A001620
- ln2f = 0x1.62e430P-1F, // (.693147181) https://oeis.org/A002162
- ln10f = 0x1.26bb1cP+1F, // (2.30258509) https://oeis.org/A002392
- log2ef = 0x1.715476P+0F, // (1.44269504)
- log10ef = 0x1.bcb7b2P-2F, // (.434294482)
- pif = 0x1.921fb6P+1F, // (3.14159265) https://oeis.org/A000796
- inv_pif = 0x1.45f306P-2F, // (.318309886) https://oeis.org/A049541
- sqrtpif = 0x1.c5bf8aP+0F, // (1.77245385) https://oeis.org/A002161
- inv_sqrtpif = 0x1.20dd76P-1F, // (.564189584) https://oeis.org/A087197
- sqrt2f = 0x1.6a09e6P+0F, // (1.41421356) https://oeis.org/A002193
- inv_sqrt2f = 0x1.6a09e6P-1F, // (.707106781)
- sqrt3f = 0x1.bb67aeP+0F, // (1.73205081) https://oeis.org/A002194
- inv_sqrt3f = 0x1.279a74P-1F, // (.577350269)
- phif = 0x1.9e377aP+0F; // (1.61803399) https://oeis.org/A001622
+inline constexpr float ef = e_v<float>;
+inline constexpr float egammaf = egamma_v<float>;
+inline constexpr float ln2f = ln2_v<float>;
+inline constexpr float ln10f = ln10_v<float>;
+inline constexpr float log2ef = log2e_v<float>;
+inline constexpr float log10ef = log10e_v<float>;
+inline constexpr float pif = pi_v<float>;
+inline constexpr float inv_pif = inv_pi_v<float>;
+inline constexpr float inv_sqrtpif = inv_sqrtpi_v<float>;
+inline constexpr float sqrt2f = sqrt2_v<float>;
+inline constexpr float inv_sqrt2f = inv_sqrt2_v<float>;
+inline constexpr float sqrt3f = sqrt3_v<float>;
+inline constexpr float inv_sqrt3f = inv_sqrt3_v<float>;
+inline constexpr float phif = phi_v<float>;
+
+// sqrtpi is not in C++20 std::numbers.
+template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+inline constexpr T sqrtpi_v = T(0x1.c5bf891b4ef6bP+0); // (1.7724538509055160273) https://oeis.org/A002161
+inline constexpr double sqrtpi = sqrtpi_v<double>;
+inline constexpr float sqrtpif = sqrtpi_v<float>;
+
// These string literals are taken from below:
// https://github.com/bminor/glibc/blob/8543577b04ded6d979ffcc5a818930e4d74d0645/math/math.h#L1215-L1229
constexpr const char *pis = "3.141592653589793238462643383279502884",
diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h
index 40709d4..a4ed712 100644
--- a/llvm/include/llvm/Support/Timer.h
+++ b/llvm/include/llvm/Support/Timer.h
@@ -167,7 +167,7 @@ public:
/// you to declare a new timer, AND specify the region to time, all in one
/// statement. All timers with the same name are merged. This is primarily
/// used for debugging and for hunting performance problems.
-struct NamedRegionTimer : public TimeRegion {
+struct NamedRegionTimer : TimeRegion {
LLVM_ABI explicit NamedRegionTimer(StringRef Name, StringRef Description,
StringRef GroupName,
StringRef GroupDescription,
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index faaff4a..4aa6c01 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -121,6 +121,7 @@ enum attributeBits {
"The Dynamic Duo! Prefer over all else because this changes " \
"most operands' meaning") \
ENUM_ENTRY(IC_64BIT_REX2, 2, "requires a REX2 prefix") \
+ ENUM_ENTRY(IC_64BIT_REX2_REXW, 3, "requires a REX2 and the W prefix") \
ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index f2de083..576f1eb 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -95,6 +95,16 @@ public:
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing);
};
+
+/// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
+/// when we don't have an index that has recorded that we are linking with
+/// allocation libraries containing the necessary APIs for downstream
+/// transformations.
+class MemProfRemoveInfo : public PassInfoMixin<MemProfRemoveInfo> {
+public:
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
diff --git a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
index 750f954..1842d2d 100644
--- a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
@@ -404,10 +404,8 @@ public:
explicit FunctionToLoopPassAdaptor(std::unique_ptr<PassConceptT> Pass,
bool UseMemorySSA = false,
- bool UseBlockFrequencyInfo = false,
bool LoopNestMode = false)
: Pass(std::move(Pass)), UseMemorySSA(UseMemorySSA),
- UseBlockFrequencyInfo(UseBlockFrequencyInfo),
LoopNestMode(LoopNestMode) {
LoopCanonicalizationFPM.addPass(LoopSimplifyPass());
LoopCanonicalizationFPM.addPass(LCSSAPass());
@@ -429,7 +427,6 @@ private:
FunctionPassManager LoopCanonicalizationFPM;
bool UseMemorySSA = false;
- bool UseBlockFrequencyInfo = false;
const bool LoopNestMode;
};
@@ -442,8 +439,7 @@ private:
/// \c LoopPassManager and the returned adaptor will be in loop-nest mode.
template <typename LoopPassT>
inline FunctionToLoopPassAdaptor
-createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA = false,
- bool UseBlockFrequencyInfo = false) {
+createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA = false) {
if constexpr (is_detected<HasRunOnLoopT, LoopPassT>::value) {
using PassModelT =
detail::PassModel<Loop, LoopPassT, LoopAnalysisManager,
@@ -453,7 +449,7 @@ createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA = false,
return FunctionToLoopPassAdaptor(
std::unique_ptr<FunctionToLoopPassAdaptor::PassConceptT>(
new PassModelT(std::forward<LoopPassT>(Pass))),
- UseMemorySSA, UseBlockFrequencyInfo, false);
+ UseMemorySSA, false);
} else {
LoopPassManager LPM;
LPM.addPass(std::forward<LoopPassT>(Pass));
@@ -465,7 +461,7 @@ createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA = false,
return FunctionToLoopPassAdaptor(
std::unique_ptr<FunctionToLoopPassAdaptor::PassConceptT>(
new PassModelT(std::move(LPM))),
- UseMemorySSA, UseBlockFrequencyInfo, true);
+ UseMemorySSA, true);
}
}
@@ -474,8 +470,7 @@ createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA = false,
template <>
inline FunctionToLoopPassAdaptor
createFunctionToLoopPassAdaptor<LoopPassManager>(LoopPassManager &&LPM,
- bool UseMemorySSA,
- bool UseBlockFrequencyInfo) {
+ bool UseMemorySSA) {
// Check if LPM contains any loop pass and if it does not, returns an adaptor
// in loop-nest mode.
using PassModelT =
@@ -487,7 +482,7 @@ createFunctionToLoopPassAdaptor<LoopPassManager>(LoopPassManager &&LPM,
return FunctionToLoopPassAdaptor(
std::unique_ptr<FunctionToLoopPassAdaptor::PassConceptT>(
new PassModelT(std::move(LPM))),
- UseMemorySSA, UseBlockFrequencyInfo, LoopNestMode);
+ UseMemorySSA, LoopNestMode);
}
/// Pass for printing a loop's contents as textual IR.
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 45c889c..a5ba197 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2177,16 +2177,13 @@ Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
return PoisonValue::get(VT->getElementType());
// TODO: Handle undef.
- if (!isa<ConstantVector>(Op) && !isa<ConstantDataVector>(Op))
- return nullptr;
-
- auto *EltC = dyn_cast<ConstantInt>(Op->getAggregateElement(0U));
+ auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U));
if (!EltC)
return nullptr;
APInt Acc = EltC->getValue();
for (unsigned I = 1, E = VT->getNumElements(); I != E; I++) {
- if (!(EltC = dyn_cast<ConstantInt>(Op->getAggregateElement(I))))
+ if (!(EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(I))))
return nullptr;
const APInt &X = EltC->getValue();
switch (IID) {
@@ -3059,35 +3056,25 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
Val = Val | Val << 1;
return ConstantInt::get(Ty, Val);
}
-
- default:
- return nullptr;
}
}
- switch (IntrinsicID) {
- default: break;
- case Intrinsic::vector_reduce_add:
- case Intrinsic::vector_reduce_mul:
- case Intrinsic::vector_reduce_and:
- case Intrinsic::vector_reduce_or:
- case Intrinsic::vector_reduce_xor:
- case Intrinsic::vector_reduce_smin:
- case Intrinsic::vector_reduce_smax:
- case Intrinsic::vector_reduce_umin:
- case Intrinsic::vector_reduce_umax:
- if (Constant *C = constantFoldVectorReduce(IntrinsicID, Operands[0]))
- return C;
- break;
- }
-
- // Support ConstantVector in case we have an Undef in the top.
- if (isa<ConstantVector>(Operands[0]) ||
- isa<ConstantDataVector>(Operands[0]) ||
- isa<ConstantAggregateZero>(Operands[0])) {
+ if (Operands[0]->getType()->isVectorTy()) {
auto *Op = cast<Constant>(Operands[0]);
switch (IntrinsicID) {
default: break;
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ if (Constant *C = constantFoldVectorReduce(IntrinsicID, Operands[0]))
+ return C;
+ break;
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse2_cvtsd2si:
@@ -3116,10 +3103,15 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
case Intrinsic::wasm_alltrue:
// Check each element individually
unsigned E = cast<FixedVectorType>(Op->getType())->getNumElements();
- for (unsigned I = 0; I != E; ++I)
- if (Constant *Elt = Op->getAggregateElement(I))
- if (Elt->isZeroValue())
- return ConstantInt::get(Ty, 0);
+ for (unsigned I = 0; I != E; ++I) {
+ Constant *Elt = Op->getAggregateElement(I);
+ // Return false as soon as we find a non-true element.
+ if (Elt && Elt->isZeroValue())
+ return ConstantInt::get(Ty, 0);
+ // Bail as soon as we find an element we cannot prove to be true.
+ if (!Elt || !isa<ConstantInt>(Elt))
+ return nullptr;
+ }
return ConstantInt::get(Ty, 1);
}
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 0e5bc48..df75999 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -947,9 +947,8 @@ LazyValueInfoImpl::solveBlockValueSelect(SelectInst *SI, BasicBlock *BB) {
/*UseBlockValue*/ false));
}
- ValueLatticeElement Result = TrueVal;
- Result.mergeIn(FalseVal);
- return Result;
+ TrueVal.mergeIn(FalseVal);
+ return TrueVal;
}
std::optional<ConstantRange>
@@ -1778,9 +1777,8 @@ ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB,
assert(OptResult && "Value not available after solving");
}
- ValueLatticeElement Result = *OptResult;
- LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
- return Result;
+ LLVM_DEBUG(dbgs() << " Result = " << *OptResult << "\n");
+ return *OptResult;
}
ValueLatticeElement LazyValueInfoImpl::getValueAt(Value *V, Instruction *CxtI) {
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index a8c3173..d84721b 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -986,8 +986,8 @@ PreservedAnalyses LoopPrinterPass::run(Function &F,
return PreservedAnalyses::all();
}
-void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) {
-
+void llvm::printLoop(const Loop &L, raw_ostream &OS,
+ const std::string &Banner) {
if (forcePrintModuleIR()) {
// handling -print-module-scope
OS << Banner << " (loop: ";
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 442b9d1..425420f 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -1840,19 +1840,19 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
// = zext((2^K * (trunc X to i{N-K}))<nuw>) to iM
// = (2^K * (zext(trunc X to i{N-K}) to iM))<nuw>.
//
- if (SM->getNumOperands() == 2)
- if (auto *MulLHS = dyn_cast<SCEVConstant>(SM->getOperand(0)))
- if (MulLHS->getAPInt().isPowerOf2())
- if (auto *TruncRHS = dyn_cast<SCEVTruncateExpr>(SM->getOperand(1))) {
- int NewTruncBits = getTypeSizeInBits(TruncRHS->getType()) -
- MulLHS->getAPInt().logBase2();
- Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits);
- return getMulExpr(
- getZeroExtendExpr(MulLHS, Ty),
- getZeroExtendExpr(
- getTruncateExpr(TruncRHS->getOperand(), NewTruncTy), Ty),
- SCEV::FlagNUW, Depth + 1);
- }
+ const APInt *C;
+ const SCEV *TruncRHS;
+ if (match(SM,
+ m_scev_Mul(m_scev_APInt(C), m_scev_Trunc(m_SCEV(TruncRHS)))) &&
+ C->isPowerOf2()) {
+ int NewTruncBits =
+ getTypeSizeInBits(SM->getOperand(1)->getType()) - C->logBase2();
+ Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits);
+ return getMulExpr(
+ getZeroExtendExpr(SM->getOperand(0), Ty),
+ getZeroExtendExpr(getTruncateExpr(TruncRHS, NewTruncTy), Ty),
+ SCEV::FlagNUW, Depth + 1);
+ }
}
// zext(umin(x, y)) -> umin(zext(x), zext(y))
@@ -3144,20 +3144,19 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
if (Ops.size() == 2) {
// C1*(C2+V) -> C1*C2 + C1*V
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
- // If any of Add's ops are Adds or Muls with a constant, apply this
- // transformation as well.
- //
- // TODO: There are some cases where this transformation is not
- // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of
- // this transformation should be narrowed down.
- if (Add->getNumOperands() == 2 && containsConstantInAddMulChain(Add)) {
- const SCEV *LHS = getMulExpr(LHSC, Add->getOperand(0),
- SCEV::FlagAnyWrap, Depth + 1);
- const SCEV *RHS = getMulExpr(LHSC, Add->getOperand(1),
- SCEV::FlagAnyWrap, Depth + 1);
- return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1);
- }
+ // If any of Add's ops are Adds or Muls with a constant, apply this
+ // transformation as well.
+ //
+ // TODO: There are some cases where this transformation is not
+ // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of
+ // this transformation should be narrowed down.
+ const SCEV *Op0, *Op1;
+ if (match(Ops[1], m_scev_Add(m_SCEV(Op0), m_SCEV(Op1))) &&
+ containsConstantInAddMulChain(Ops[1])) {
+ const SCEV *LHS = getMulExpr(LHSC, Op0, SCEV::FlagAnyWrap, Depth + 1);
+ const SCEV *RHS = getMulExpr(LHSC, Op1, SCEV::FlagAnyWrap, Depth + 1);
+ return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1);
+ }
if (Ops[0]->isAllOnesValue()) {
// If we have a mul by -1 of an add, try distributing the -1 among the
@@ -3578,20 +3577,12 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
}
// ((-C + (C smax %x)) /u %x) evaluates to zero, for any positive constant C.
- if (const auto *AE = dyn_cast<SCEVAddExpr>(LHS);
- AE && AE->getNumOperands() == 2) {
- if (const auto *VC = dyn_cast<SCEVConstant>(AE->getOperand(0))) {
- const APInt &NegC = VC->getAPInt();
- if (NegC.isNegative() && !NegC.isMinSignedValue()) {
- const auto *MME = dyn_cast<SCEVSMaxExpr>(AE->getOperand(1));
- if (MME && MME->getNumOperands() == 2 &&
- isa<SCEVConstant>(MME->getOperand(0)) &&
- cast<SCEVConstant>(MME->getOperand(0))->getAPInt() == -NegC &&
- MME->getOperand(1) == RHS)
- return getZero(LHS->getType());
- }
- }
- }
+ const APInt *NegC, *C;
+ if (match(LHS,
+ m_scev_Add(m_scev_APInt(NegC),
+ m_scev_SMax(m_scev_APInt(C), m_scev_Specific(RHS)))) &&
+ NegC->isNegative() && !NegC->isMinSignedValue() && *C == -*NegC)
+ return getZero(LHS->getType());
// TODO: Generalize to handle any common factors.
// udiv (mul nuw a, vscale), (mul nuw b, vscale) --> udiv a, b
@@ -10791,19 +10782,15 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) {
}
static bool MatchBinarySub(const SCEV *S, const SCEV *&LHS, const SCEV *&RHS) {
- const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S);
- if (!Add || Add->getNumOperands() != 2)
+ const SCEV *Op0, *Op1;
+ if (!match(S, m_scev_Add(m_SCEV(Op0), m_SCEV(Op1))))
return false;
- if (auto *ME = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
- ME && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) {
- LHS = Add->getOperand(1);
- RHS = ME->getOperand(1);
+ if (match(Op0, m_scev_Mul(m_scev_AllOnes(), m_SCEV(RHS)))) {
+ LHS = Op1;
return true;
}
- if (auto *ME = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
- ME && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) {
- LHS = Add->getOperand(0);
- RHS = ME->getOperand(1);
+ if (match(Op1, m_scev_Mul(m_scev_AllOnes(), m_SCEV(RHS)))) {
+ LHS = Op0;
return true;
}
return false;
@@ -12166,13 +12153,10 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr,
const SCEV *&L, const SCEV *&R,
SCEV::NoWrapFlags &Flags) {
- const auto *AE = dyn_cast<SCEVAddExpr>(Expr);
- if (!AE || AE->getNumOperands() != 2)
+ if (!match(Expr, m_scev_Add(m_SCEV(L), m_SCEV(R))))
return false;
- L = AE->getOperand(0);
- R = AE->getOperand(1);
- Flags = AE->getNoWrapFlags();
+ Flags = cast<SCEVAddExpr>(Expr)->getNoWrapFlags();
return true;
}
@@ -15550,19 +15534,10 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
auto IsMinMaxSCEVWithNonNegativeConstant =
[&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS,
const SCEV *&RHS) {
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) {
- if (MinMax->getNumOperands() != 2)
- return false;
- if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) {
- if (C->getAPInt().isNegative())
- return false;
- SCTy = MinMax->getSCEVType();
- LHS = MinMax->getOperand(0);
- RHS = MinMax->getOperand(1);
- return true;
- }
- }
- return false;
+ const APInt *C;
+ SCTy = Expr->getSCEVType();
+ return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) &&
+ match(LHS, m_scev_APInt(C)) && C->isNonNegative();
};
// Return a new SCEV that modifies \p Expr to the closest number divides by
@@ -15765,19 +15740,26 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
GetNextSCEVDividesByDivisor(One, DividesBy);
To = SE.getUMaxExpr(FromRewritten, OneAlignedUp);
} else {
+ // LHS != RHS can be rewritten as (LHS - RHS) = UMax(1, LHS - RHS),
+ // but creating the subtraction eagerly is expensive. Track the
+ // inequalities in a separate map, and materialize the rewrite lazily
+ // when encountering a suitable subtraction while re-writing.
if (LHS->getType()->isPointerTy()) {
LHS = SE.getLosslessPtrToIntExpr(LHS);
RHS = SE.getLosslessPtrToIntExpr(RHS);
if (isa<SCEVCouldNotCompute>(LHS) || isa<SCEVCouldNotCompute>(RHS))
break;
}
- auto AddSubRewrite = [&](const SCEV *A, const SCEV *B) {
- const SCEV *Sub = SE.getMinusSCEV(A, B);
- AddRewrite(Sub, Sub,
- SE.getUMaxExpr(Sub, SE.getOne(From->getType())));
- };
- AddSubRewrite(LHS, RHS);
- AddSubRewrite(RHS, LHS);
+ const SCEVConstant *C;
+ const SCEV *A, *B;
+ if (match(RHS, m_scev_Add(m_SCEVConstant(C), m_SCEV(A))) &&
+ match(LHS, m_scev_Add(m_scev_Specific(C), m_SCEV(B)))) {
+ RHS = A;
+ LHS = B;
+ }
+ if (LHS > RHS)
+ std::swap(LHS, RHS);
+ Guards.NotEqual.insert({LHS, RHS});
continue;
}
break;
@@ -15911,13 +15893,15 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
class SCEVLoopGuardRewriter
: public SCEVRewriteVisitor<SCEVLoopGuardRewriter> {
const DenseMap<const SCEV *, const SCEV *> &Map;
+ const SmallDenseSet<std::pair<const SCEV *, const SCEV *>> &NotEqual;
SCEV::NoWrapFlags FlagMask = SCEV::FlagAnyWrap;
public:
SCEVLoopGuardRewriter(ScalarEvolution &SE,
const ScalarEvolution::LoopGuards &Guards)
- : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap) {
+ : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap),
+ NotEqual(Guards.NotEqual) {
if (Guards.PreserveNUW)
FlagMask = ScalarEvolution::setFlags(FlagMask, SCEV::FlagNUW);
if (Guards.PreserveNSW)
@@ -15972,14 +15956,36 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
}
const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+ // Helper to check if S is a subtraction (A - B) where A != B, and if so,
+ // return UMax(S, 1).
+ auto RewriteSubtraction = [&](const SCEV *S) -> const SCEV * {
+ const SCEV *LHS, *RHS;
+ if (MatchBinarySub(S, LHS, RHS)) {
+ if (LHS > RHS)
+ std::swap(LHS, RHS);
+ if (NotEqual.contains({LHS, RHS}))
+ return SE.getUMaxExpr(S, SE.getOne(S->getType()));
+ }
+ return nullptr;
+ };
+
+ // Check if Expr itself is a subtraction pattern with guard info.
+ if (const SCEV *Rewritten = RewriteSubtraction(Expr))
+ return Rewritten;
+
// Trip count expressions sometimes consist of adding 3 operands, i.e.
// (Const + A + B). There may be guard info for A + B, and if so, apply
// it.
// TODO: Could more generally apply guards to Add sub-expressions.
if (isa<SCEVConstant>(Expr->getOperand(0)) &&
Expr->getNumOperands() == 3) {
- if (const SCEV *S = Map.lookup(
- SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2))))
+ const SCEV *Add =
+ SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2));
+ if (const SCEV *Rewritten = RewriteSubtraction(Add))
+ return SE.getAddExpr(
+ Expr->getOperand(0), Rewritten,
+ ScalarEvolution::maskFlags(Expr->getNoWrapFlags(), FlagMask));
+ if (const SCEV *S = Map.lookup(Add))
return SE.getAddExpr(Expr->getOperand(0), S);
}
SmallVector<const SCEV *, 2> Operands;
@@ -16014,7 +16020,7 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
}
};
- if (RewriteMap.empty())
+ if (RewriteMap.empty() && NotEqual.empty())
return Expr;
SCEVLoopGuardRewriter Rewriter(SE, *this);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 433877f..567acf7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1039,12 +1039,17 @@ void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit,
} else
NewCU.addString(Die, dwarf::DW_AT_producer, Producer);
- if (auto Lang = DIUnit->getSourceLanguage(); Lang.hasVersionedName())
+ if (auto Lang = DIUnit->getSourceLanguage(); Lang.hasVersionedName()) {
NewCU.addUInt(Die, dwarf::DW_AT_language_name, dwarf::DW_FORM_data2,
Lang.getName());
- else
+
+ if (uint32_t LangVersion = Lang.getVersion(); LangVersion != 0)
+ NewCU.addUInt(Die, dwarf::DW_AT_language_version, /*Form=*/std::nullopt,
+ LangVersion);
+ } else {
NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
Lang.getName());
+ }
NewCU.addString(Die, dwarf::DW_AT_name, FN);
StringRef SysRoot = DIUnit->getSysRoot();
@@ -2066,11 +2071,36 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
if (NoDebug)
return;
+ auto RecordLineZero = [&]() {
+ // Preserve the file and column numbers, if we can, to save space in
+ // the encoded line table.
+ // Do not update PrevInstLoc, it remembers the last non-0 line.
+ const MDNode *Scope = nullptr;
+ unsigned Column = 0;
+ if (PrevInstLoc) {
+ Scope = PrevInstLoc.getScope();
+ Column = PrevInstLoc.getCol();
+ }
+ recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
+ };
+
+ // When we emit a line-0 record, we don't update PrevInstLoc; so look at
+ // the last line number actually emitted, to see if it was line 0.
+ unsigned LastAsmLine =
+ Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
+
// Check if source location changes, but ignore DBG_VALUE and CFI locations.
// If the instruction is part of the function frame setup code, do not emit
// any line record, as there is no correspondence with any user code.
- if (MI->isMetaInstruction() || MI->getFlag(MachineInstr::FrameSetup))
+ if (MI->isMetaInstruction())
+ return;
+ if (MI->getFlag(MachineInstr::FrameSetup)) {
+ // Prevent a loc from the previous block leaking into frame setup instrs.
+ if (LastAsmLine && PrevInstBB && PrevInstBB != MI->getParent())
+ RecordLineZero();
return;
+ }
+
const DebugLoc &DL = MI->getDebugLoc();
unsigned Flags = 0;
@@ -2093,11 +2123,6 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
LocationString);
};
- // When we emit a line-0 record, we don't update PrevInstLoc; so look at
- // the last line number actually emitted, to see if it was line 0.
- unsigned LastAsmLine =
- Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
-
// There may be a mixture of scopes using and not using Key Instructions.
// Not-Key-Instructions functions inlined into Key Instructions functions
// should use not-key is_stmt handling. Key Instructions functions inlined
@@ -2163,18 +2188,8 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
// - Instruction is at the top of a block; we don't want to inherit the
// location from the physically previous (maybe unrelated) block.
if (UnknownLocations == Enable || PrevLabel ||
- (PrevInstBB && PrevInstBB != MI->getParent())) {
- // Preserve the file and column numbers, if we can, to save space in
- // the encoded line table.
- // Do not update PrevInstLoc, it remembers the last non-0 line.
- const MDNode *Scope = nullptr;
- unsigned Column = 0;
- if (PrevInstLoc) {
- Scope = PrevInstLoc.getScope();
- Column = PrevInstLoc.getCol();
- }
- recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
- }
+ (PrevInstBB && PrevInstBB != MI->getParent()))
+ RecordLineZero();
return;
}
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index f28b989..d8374b6 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6041,8 +6041,7 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
Triple T(TT);
// The only data layout upgrades needed for pre-GCN, SPIR or SPIRV are setting
// the address space of globals to 1. This does not apply to SPIRV Logical.
- if (((T.isAMDGPU() && !T.isAMDGCN()) ||
- (T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical()))) &&
+ if ((T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical())) &&
!DL.contains("-G") && !DL.starts_with("G")) {
return DL.empty() ? std::string("G1") : (DL + "-G1").str();
}
@@ -6055,35 +6054,43 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
return DL.str();
}
+ // AMDGPU data layout upgrades.
std::string Res = DL.str();
- // AMDGCN data layout upgrades.
- if (T.isAMDGCN()) {
+ if (T.isAMDGPU()) {
// Define address spaces for constants.
if (!DL.contains("-G") && !DL.starts_with("G"))
Res.append(Res.empty() ? "G1" : "-G1");
- // Add missing non-integral declarations.
- // This goes before adding new address spaces to prevent incoherent string
- // values.
- if (!DL.contains("-ni") && !DL.starts_with("ni"))
- Res.append("-ni:7:8:9");
- // Update ni:7 to ni:7:8:9.
- if (DL.ends_with("ni:7"))
- Res.append(":8:9");
- if (DL.ends_with("ni:7:8"))
- Res.append(":9");
-
- // Add sizing for address spaces 7 and 8 (fat raw buffers and buffer
- // resources) An empty data layout has already been upgraded to G1 by now.
- if (!DL.contains("-p7") && !DL.starts_with("p7"))
- Res.append("-p7:160:256:256:32");
- if (!DL.contains("-p8") && !DL.starts_with("p8"))
- Res.append("-p8:128:128:128:48");
- constexpr StringRef OldP8("-p8:128:128-");
- if (DL.contains(OldP8))
- Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
- if (!DL.contains("-p9") && !DL.starts_with("p9"))
- Res.append("-p9:192:256:256:32");
+ // AMDGCN data layout upgrades.
+ if (T.isAMDGCN()) {
+
+ // Add missing non-integral declarations.
+ // This goes before adding new address spaces to prevent incoherent string
+ // values.
+ if (!DL.contains("-ni") && !DL.starts_with("ni"))
+ Res.append("-ni:7:8:9");
+ // Update ni:7 to ni:7:8:9.
+ if (DL.ends_with("ni:7"))
+ Res.append(":8:9");
+ if (DL.ends_with("ni:7:8"))
+ Res.append(":9");
+
+ // Add sizing for address spaces 7 and 8 (fat raw buffers and buffer
+ // resources) An empty data layout has already been upgraded to G1 by now.
+ if (!DL.contains("-p7") && !DL.starts_with("p7"))
+ Res.append("-p7:160:256:256:32");
+ if (!DL.contains("-p8") && !DL.starts_with("p8"))
+ Res.append("-p8:128:128:128:48");
+ constexpr StringRef OldP8("-p8:128:128-");
+ if (DL.contains(OldP8))
+ Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
+ if (!DL.contains("-p9") && !DL.starts_with("p9"))
+ Res.append("-p9:192:256:256:32");
+ }
+
+ // Upgrade the ELF mangling mode.
+ if (!DL.contains("m:e"))
+ Res = Res.empty() ? "m:e" : "m:e-" + Res;
return Res;
}
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 9601a8a..5883606 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -294,9 +294,9 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) {
// just DISubprogram's, referenced from anywhere within the Function being
// cloned prior to calling MapMetadata / RemapInstruction to avoid their
// duplication later as DICompileUnit's are also directly referenced by
- // llvm.dbg.cu list. Thefore we need to collect DICompileUnit's here as well.
- // Also, DICompileUnit's may reference DISubprogram's too and therefore need
- // to be at least looked through.
+ // llvm.dbg.cu list. Therefore we need to collect DICompileUnit's here as
+ // well. Also, DICompileUnit's may reference DISubprogram's too and therefore
+ // need to be at least looked through.
processCompileUnit(SP->getUnit());
processType(SP->getType());
for (auto *Element : SP->getTemplateParams()) {
@@ -377,7 +377,7 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
/// Recursively handle DILocations in followup metadata etc.
///
-/// TODO: If for example a followup loop metadata would refence itself this
+/// TODO: If for example a followup loop metadata would reference itself this
/// function would go into infinite recursion. We do not expect such cycles in
/// the loop metadata (except for the self-referencing first element
/// "LoopID"). However, we could at least handle such situations more gracefully
@@ -679,7 +679,7 @@ private:
auto Variables = nullptr;
auto TemplateParams = nullptr;
- // Make a distinct DISubprogram, for situations that warrent it.
+ // Make a distinct DISubprogram, for situations that warrant it.
auto distinctMDSubprogram = [&]() {
return DISubprogram::getDistinct(
MDS->getContext(), FileAndScope, MDS->getName(), LinkageName,
@@ -1095,6 +1095,35 @@ LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename,
StringRef(Directory, DirectoryLen)));
}
+static llvm::DIFile::ChecksumKind
+map_from_llvmChecksumKind(LLVMChecksumKind CSKind) {
+ switch (CSKind) {
+ case LLVMChecksumKind::CSK_MD5:
+ return llvm::DIFile::CSK_MD5;
+ case LLVMChecksumKind::CSK_SHA1:
+ return llvm::DIFile::CSK_SHA1;
+ case LLVMChecksumKind::CSK_SHA256:
+ return llvm::DIFile::CSK_SHA256;
+ }
+ llvm_unreachable("Unhandled Checksum Kind");
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateFileWithChecksum(
+ LLVMDIBuilderRef Builder, const char *Filename, size_t FilenameLen,
+ const char *Directory, size_t DirectoryLen, LLVMChecksumKind ChecksumKind,
+ const char *Checksum, size_t ChecksumLen, const char *Source,
+ size_t SourceLen) {
+ StringRef ChkSum = StringRef(Checksum, ChecksumLen);
+ auto CSK = map_from_llvmChecksumKind(ChecksumKind);
+ llvm::DIFile::ChecksumInfo<StringRef> CSInfo(CSK, ChkSum);
+ std::optional<StringRef> Src;
+ if (SourceLen > 0)
+ Src = StringRef(Source, SourceLen);
+ return wrap(unwrap(Builder)->createFile(StringRef(Filename, FilenameLen),
+ StringRef(Directory, DirectoryLen),
+ CSInfo, Src));
+}
+
LLVMMetadataRef
LLVMDIBuilderCreateModule(LLVMDIBuilderRef Builder, LLVMMetadataRef ParentScope,
const char *Name, size_t NameLen,
@@ -2014,7 +2043,7 @@ void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map,
I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
}
-/// Collect constant properies (base, size, offset) of \p StoreDest.
+/// Collect constant properties (base, size, offset) of \p StoreDest.
/// Return std::nullopt if any properties are not constants or the
/// offset from the base pointer is negative.
static std::optional<AssignmentInfo>
@@ -2300,7 +2329,7 @@ PreservedAnalyses AssignmentTrackingPass::run(Function &F,
return PreservedAnalyses::all();
// Record that this module uses assignment tracking. It doesn't matter that
- // some functons in the module may not use it - the debug info in those
+ // some functions in the module may not use it - the debug info in those
// functions will still be handled properly.
setAssignmentTrackingModuleFlag(*F.getParent());
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c79a950..3572852 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6479,9 +6479,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
NumRows->getZExtValue() * NumColumns->getZExtValue(),
"Result of a matrix operation does not fit in the returned vector!");
- if (Stride)
+ if (Stride) {
+ Check(Stride->getBitWidth() <= 64, "Stride bitwidth cannot exceed 64!",
+ IF);
Check(Stride->getZExtValue() >= NumRows->getZExtValue(),
"Stride must be greater or equal than the number of rows!", IF);
+ }
break;
}
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index e6544f3..aec8891 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1257,38 +1257,6 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) {
return Result;
}
-void lto::updateMemProfAttributes(Module &Mod,
- const ModuleSummaryIndex &Index) {
- llvm::TimeTraceScope timeScope("LTO update memprof attributes");
- if (Index.withSupportsHotColdNew())
- return;
-
- // The profile matcher applies hotness attributes directly for allocations,
- // and those will cause us to generate calls to the hot/cold interfaces
- // unconditionally. If supports-hot-cold-new was not enabled in the LTO
- // link then assume we don't want these calls (e.g. not linking with
- // the appropriate library, or otherwise trying to disable this behavior).
- for (auto &F : Mod) {
- for (auto &BB : F) {
- for (auto &I : BB) {
- auto *CI = dyn_cast<CallBase>(&I);
- if (!CI)
- continue;
- if (CI->hasFnAttr("memprof"))
- CI->removeFnAttr("memprof");
- // Strip off all memprof metadata as it is no longer needed.
- // Importantly, this avoids the addition of new memprof attributes
- // after inlining propagation.
- // TODO: If we support additional types of MemProf metadata beyond hot
- // and cold, we will need to update the metadata based on the allocator
- // APIs supported instead of completely stripping all.
- CI->setMetadata(LLVMContext::MD_memprof, nullptr);
- CI->setMetadata(LLVMContext::MD_callsite, nullptr);
- }
- }
- }
-}
-
Error LTO::runRegularLTO(AddStreamFn AddStream) {
llvm::TimeTraceScope timeScope("Run regular LTO");
LLVMContext &CombinedCtx = RegularLTO.CombinedModule->getContext();
@@ -1346,8 +1314,6 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
}
}
- updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex);
-
bool WholeProgramVisibilityEnabledInLTO =
Conf.HasWholeProgramVisibility &&
// If validation is enabled, upgrade visibility only when all vtables
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 11a7b32..280c3d1 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -726,7 +726,6 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
}
// Do this after any importing so that imported code is updated.
- updateMemProfAttributes(Mod, CombinedIndex);
updatePublicTypeTestCalls(Mod, CombinedIndex.withWholeProgramVisibility());
if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod))
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 53cf004..e45cac8 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -2027,13 +2027,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
#define LOOPNEST_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
MPM.addPass(createModuleToFunctionPassAdaptor( \
- createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \
+ createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \
return Error::success(); \
}
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
MPM.addPass(createModuleToFunctionPassAdaptor( \
- createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \
+ createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \
return Error::success(); \
}
#define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
@@ -2041,9 +2041,8 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
auto Params = parsePassParameters(PARSER, Name, NAME); \
if (!Params) \
return Params.takeError(); \
- MPM.addPass( \
- createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \
- CREATE_PASS(Params.get()), false, false))); \
+ MPM.addPass(createModuleToFunctionPassAdaptor( \
+ createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), false))); \
return Error::success(); \
}
#include "PassRegistry.def"
@@ -2142,13 +2141,13 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
#define LOOPNEST_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
CGPM.addPass(createCGSCCToFunctionPassAdaptor( \
- createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \
+ createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \
return Error::success(); \
}
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
CGPM.addPass(createCGSCCToFunctionPassAdaptor( \
- createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \
+ createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \
return Error::success(); \
}
#define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
@@ -2156,9 +2155,8 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
auto Params = parsePassParameters(PARSER, Name, NAME); \
if (!Params) \
return Params.takeError(); \
- CGPM.addPass( \
- createCGSCCToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \
- CREATE_PASS(Params.get()), false, false))); \
+ CGPM.addPass(createCGSCCToFunctionPassAdaptor( \
+ createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), false))); \
return Error::success(); \
}
#include "PassRegistry.def"
@@ -2191,11 +2189,8 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
return Err;
// Add the nested pass manager with the appropriate adaptor.
bool UseMemorySSA = (Name == "loop-mssa");
- bool UseBFI = llvm::any_of(InnerPipeline, [](auto Pipeline) {
- return Pipeline.Name.contains("simple-loop-unswitch");
- });
- FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA,
- UseBFI));
+ FPM.addPass(
+ createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA));
return Error::success();
}
if (Name == "machine-function") {
@@ -2248,12 +2243,12 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
// The risk is that it may become obsolete if we're not careful.
#define LOOPNEST_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
- FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false, false)); \
+ FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false)); \
return Error::success(); \
}
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
- FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false, false)); \
+ FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false)); \
return Error::success(); \
}
#define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
@@ -2261,8 +2256,8 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
auto Params = parsePassParameters(PARSER, Name, NAME); \
if (!Params) \
return Params.takeError(); \
- FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), \
- false, false)); \
+ FPM.addPass( \
+ createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), false)); \
return Error::success(); \
}
#include "PassRegistry.def"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index fea0d25..bd03ac0 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -519,16 +519,14 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
- /*UseMemorySSA=*/true,
- /*UseBlockFrequencyInfo=*/true));
+ /*UseMemorySSA=*/true));
FPM.addPass(
SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
FPM.addPass(InstCombinePass());
// The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
// *All* loop passes must preserve it, in order to be able to use it.
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
- /*UseMemorySSA=*/false,
- /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/false));
// Delete small array after loop unroll.
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
@@ -710,8 +708,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
invokeLoopOptimizerEndEPCallbacks(LPM2, Level);
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
- /*UseMemorySSA=*/true,
- /*UseBlockFrequencyInfo=*/true));
+ /*UseMemorySSA=*/true));
FPM.addPass(
SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
FPM.addPass(InstCombinePass());
@@ -719,8 +716,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
// *All* loop passes must preserve it, in order to be able to use it.
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
- /*UseMemorySSA=*/false,
- /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/false));
// Delete small array after loop unroll.
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
@@ -773,7 +769,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(createFunctionToLoopPassAdaptor(
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
/*AllowSpeculation=*/true),
- /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/true));
FPM.addPass(CoroElidePass());
@@ -842,8 +838,7 @@ void PassBuilder::addPostPGOLoopRotation(ModulePassManager &MPM,
createFunctionToLoopPassAdaptor(
LoopRotatePass(EnableLoopHeaderDuplication ||
Level != OptimizationLevel::Oz),
- /*UseMemorySSA=*/false,
- /*UseBlockFrequencyInfo=*/false),
+ /*UseMemorySSA=*/false),
PTO.EagerlyInvalidateAnalyses));
}
}
@@ -1358,8 +1353,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
OptimizationLevel::O3));
ExtraPasses.addPass(
- createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
- /*UseBlockFrequencyInfo=*/true));
+ createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true));
ExtraPasses.addPass(
SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
ExtraPasses.addPass(InstCombinePass());
@@ -1438,7 +1432,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(createFunctionToLoopPassAdaptor(
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
/*AllowSpeculation=*/true),
- /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/true));
// Now that we've vectorized and unrolled loops, we may have more refined
// alignment information, try to re-derive it here.
@@ -1520,7 +1514,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
OptimizePM.addPass(createFunctionToLoopPassAdaptor(
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
/*AllowSpeculation=*/true),
- /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/true));
}
OptimizePM.addPass(Float2IntPass());
@@ -1560,8 +1554,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
if (PTO.LoopInterchange)
LPM.addPass(LoopInterchangePass());
- OptimizePM.addPass(createFunctionToLoopPassAdaptor(
- std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
+ OptimizePM.addPass(
+ createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false));
// FIXME: This may not be the right place in the pipeline.
// We need to have the data to support the right place.
@@ -1658,6 +1652,16 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
ModulePassManager MPM;
+ // Currently this pipeline is only invoked in an LTO pre link pass or when we
+ // are not running LTO. If that changes the below checks may need updating.
+ assert(isLTOPreLink(Phase) || Phase == ThinOrFullLTOPhase::None);
+
+ // If we are invoking this in non-LTO mode, remove any MemProf related
+ // attributes and metadata, as we don't know whether we are linking with
+ // a library containing the necessary interfaces.
+ if (Phase == ThinOrFullLTOPhase::None)
+ MPM.addPass(MemProfRemoveInfo());
+
// Convert @llvm.global.annotations to !annotation metadata.
MPM.addPass(Annotation2MetadataPass());
@@ -1803,6 +1807,12 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
ModulePassManager MPM;
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ImportSummary || !ImportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
if (ImportSummary) {
// For ThinLTO we must apply the context disambiguation decisions early, to
// ensure we can correctly match the callsites to summary data.
@@ -1874,6 +1884,12 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level);
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ExportSummary || !ExportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
// Create a function that performs CFI checks for cross-DSO calls with targets
// in the current module.
MPM.addPass(CrossDSOCFIPass());
@@ -2111,7 +2127,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
MainFPM.addPass(createFunctionToLoopPassAdaptor(
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
/*AllowSpeculation=*/true),
- /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/true));
if (RunNewGVN)
MainFPM.addPass(NewGVNPass());
@@ -2141,8 +2157,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
PTO.ForgetAllSCEVInLoopUnroll));
// The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
// *All* loop passes must preserve it, in order to be able to use it.
- MainFPM.addPass(createFunctionToLoopPassAdaptor(
- std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
+ MainFPM.addPass(
+ createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false));
MainFPM.addPass(LoopDistributePass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1b16525..884d8da 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -113,6 +113,7 @@ MODULE_PASS("pgo-force-function-attrs",
? PGOOpt->ColdOptType
: PGOOptions::ColdFuncOpt::Default))
MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
+MODULE_PASS("memprof-remove-attributes", MemProfRemoveInfo())
MODULE_PASS("memprof-module", ModuleMemProfilerPass())
MODULE_PASS("mergefunc", MergeFunctionsPass())
MODULE_PASS("metarenamer", MetaRenamerPass())
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h
index 4f66c47..914edd8 100644
--- a/llvm/lib/Remarks/BitstreamRemarkParser.h
+++ b/llvm/lib/Remarks/BitstreamRemarkParser.h
@@ -112,7 +112,7 @@ public:
/// Helper to parse a META_BLOCK for a bitstream remark container.
class BitstreamMetaParserHelper
: public BitstreamBlockParserHelper<BitstreamMetaParserHelper> {
- friend class BitstreamBlockParserHelper;
+ friend class BitstreamBlockParserHelper<BitstreamMetaParserHelper>;
public:
struct ContainerInfo {
@@ -137,7 +137,7 @@ protected:
/// Helper to parse a REMARK_BLOCK for a bitstream remark container.
class BitstreamRemarkParserHelper
: public BitstreamBlockParserHelper<BitstreamRemarkParserHelper> {
- friend class BitstreamBlockParserHelper;
+ friend class BitstreamBlockParserHelper<BitstreamRemarkParserHelper>;
protected:
SmallVector<uint64_t, 5> Record;
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 8623c06..b4de79a 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -130,44 +130,46 @@ struct fltSemantics {
bool hasSignBitInMSB = true;
};
-static constexpr fltSemantics semIEEEhalf = {15, -14, 11, 16};
-static constexpr fltSemantics semBFloat = {127, -126, 8, 16};
-static constexpr fltSemantics semIEEEsingle = {127, -126, 24, 32};
-static constexpr fltSemantics semIEEEdouble = {1023, -1022, 53, 64};
-static constexpr fltSemantics semIEEEquad = {16383, -16382, 113, 128};
-static constexpr fltSemantics semFloat8E5M2 = {15, -14, 3, 8};
-static constexpr fltSemantics semFloat8E5M2FNUZ = {
+constexpr fltSemantics APFloatBase::semIEEEhalf = {15, -14, 11, 16};
+constexpr fltSemantics APFloatBase::semBFloat = {127, -126, 8, 16};
+constexpr fltSemantics APFloatBase::semIEEEsingle = {127, -126, 24, 32};
+constexpr fltSemantics APFloatBase::semIEEEdouble = {1023, -1022, 53, 64};
+constexpr fltSemantics APFloatBase::semIEEEquad = {16383, -16382, 113, 128};
+constexpr fltSemantics APFloatBase::semFloat8E5M2 = {15, -14, 3, 8};
+constexpr fltSemantics APFloatBase::semFloat8E5M2FNUZ = {
15, -15, 3, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
-static constexpr fltSemantics semFloat8E4M3 = {7, -6, 4, 8};
-static constexpr fltSemantics semFloat8E4M3FN = {
+constexpr fltSemantics APFloatBase::semFloat8E4M3 = {7, -6, 4, 8};
+constexpr fltSemantics APFloatBase::semFloat8E4M3FN = {
8, -6, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes};
-static constexpr fltSemantics semFloat8E4M3FNUZ = {
+constexpr fltSemantics APFloatBase::semFloat8E4M3FNUZ = {
7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
-static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
+constexpr fltSemantics APFloatBase::semFloat8E4M3B11FNUZ = {
4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
-static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8};
-static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
-static constexpr fltSemantics semFloat8E8M0FNU = {127,
- -127,
- 1,
- 8,
- fltNonfiniteBehavior::NanOnly,
- fltNanEncoding::AllOnes,
- false,
- false,
- false};
-
-static constexpr fltSemantics semFloat6E3M2FN = {
+constexpr fltSemantics APFloatBase::semFloat8E3M4 = {3, -2, 5, 8};
+constexpr fltSemantics APFloatBase::semFloatTF32 = {127, -126, 11, 19};
+constexpr fltSemantics APFloatBase::semFloat8E8M0FNU = {
+ 127,
+ -127,
+ 1,
+ 8,
+ fltNonfiniteBehavior::NanOnly,
+ fltNanEncoding::AllOnes,
+ false,
+ false,
+ false};
+
+constexpr fltSemantics APFloatBase::semFloat6E3M2FN = {
4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
-static constexpr fltSemantics semFloat6E2M3FN = {
+constexpr fltSemantics APFloatBase::semFloat6E2M3FN = {
2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly};
-static constexpr fltSemantics semFloat4E2M1FN = {
+constexpr fltSemantics APFloatBase::semFloat4E2M1FN = {
2, 0, 2, 4, fltNonfiniteBehavior::FiniteOnly};
-static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
-static constexpr fltSemantics semBogus = {0, 0, 0, 0};
-static constexpr fltSemantics semPPCDoubleDouble = {-1, 0, 0, 128};
-static constexpr fltSemantics semPPCDoubleDoubleLegacy = {1023, -1022 + 53,
- 53 + 53, 128};
+constexpr fltSemantics APFloatBase::semX87DoubleExtended = {16383, -16382, 64,
+ 80};
+constexpr fltSemantics APFloatBase::semBogus = {0, 0, 0, 0};
+constexpr fltSemantics APFloatBase::semPPCDoubleDouble = {-1, 0, 0, 128};
+constexpr fltSemantics APFloatBase::semPPCDoubleDoubleLegacy = {
+ 1023, -1022 + 53, 53 + 53, 128};
const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
switch (S) {
@@ -261,36 +263,6 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
llvm_unreachable("Unknown floating semantics");
}
-const fltSemantics &APFloatBase::IEEEhalf() { return semIEEEhalf; }
-const fltSemantics &APFloatBase::BFloat() { return semBFloat; }
-const fltSemantics &APFloatBase::IEEEsingle() { return semIEEEsingle; }
-const fltSemantics &APFloatBase::IEEEdouble() { return semIEEEdouble; }
-const fltSemantics &APFloatBase::IEEEquad() { return semIEEEquad; }
-const fltSemantics &APFloatBase::PPCDoubleDouble() {
- return semPPCDoubleDouble;
-}
-const fltSemantics &APFloatBase::PPCDoubleDoubleLegacy() {
- return semPPCDoubleDoubleLegacy;
-}
-const fltSemantics &APFloatBase::Float8E5M2() { return semFloat8E5M2; }
-const fltSemantics &APFloatBase::Float8E5M2FNUZ() { return semFloat8E5M2FNUZ; }
-const fltSemantics &APFloatBase::Float8E4M3() { return semFloat8E4M3; }
-const fltSemantics &APFloatBase::Float8E4M3FN() { return semFloat8E4M3FN; }
-const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; }
-const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
- return semFloat8E4M3B11FNUZ;
-}
-const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; }
-const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
-const fltSemantics &APFloatBase::Float8E8M0FNU() { return semFloat8E8M0FNU; }
-const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
-const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
-const fltSemantics &APFloatBase::Float4E2M1FN() { return semFloat4E2M1FN; }
-const fltSemantics &APFloatBase::x87DoubleExtended() {
- return semX87DoubleExtended;
-}
-const fltSemantics &APFloatBase::Bogus() { return semBogus; }
-
bool APFloatBase::isRepresentableBy(const fltSemantics &A,
const fltSemantics &B) {
return A.maxExponent <= B.maxExponent && A.minExponent >= B.minExponent &&
@@ -1029,7 +1001,7 @@ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) {
// For x87 extended precision, we want to make a NaN, not a
// pseudo-NaN. Maybe we should expose the ability to make
// pseudo-NaNs?
- if (semantics == &semX87DoubleExtended)
+ if (semantics == &APFloatBase::semX87DoubleExtended)
APInt::tcSetBit(significand, QNaNBit + 1);
}
@@ -1054,7 +1026,7 @@ IEEEFloat &IEEEFloat::operator=(IEEEFloat &&rhs) {
category = rhs.category;
sign = rhs.sign;
- rhs.semantics = &semBogus;
+ rhs.semantics = &APFloatBase::semBogus;
return *this;
}
@@ -1247,7 +1219,7 @@ IEEEFloat::IEEEFloat(const IEEEFloat &rhs) {
assign(rhs);
}
-IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&semBogus) {
+IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&APFloatBase::semBogus) {
*this = std::move(rhs);
}
@@ -2607,8 +2579,8 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
shift = toSemantics.precision - fromSemantics.precision;
bool X86SpecialNan = false;
- if (&fromSemantics == &semX87DoubleExtended &&
- &toSemantics != &semX87DoubleExtended && category == fcNaN &&
+ if (&fromSemantics == &APFloatBase::semX87DoubleExtended &&
+ &toSemantics != &APFloatBase::semX87DoubleExtended && category == fcNaN &&
(!(*significandParts() & 0x8000000000000000ULL) ||
!(*significandParts() & 0x4000000000000000ULL))) {
// x86 has some unusual NaNs which cannot be represented in any other
@@ -2694,7 +2666,7 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
// For x87 extended precision, we want to make a NaN, not a special NaN if
// the input wasn't special either.
- if (!X86SpecialNan && semantics == &semX87DoubleExtended)
+ if (!X86SpecialNan && semantics == &APFloatBase::semX87DoubleExtended)
APInt::tcSetBit(significandParts(), semantics->precision - 1);
// Convert of sNaN creates qNaN and raises an exception (invalid op).
@@ -3530,7 +3502,8 @@ hash_code hash_value(const IEEEFloat &Arg) {
// the actual IEEE respresentations. We compensate for that here.
APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const {
- assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended);
+ assert(semantics ==
+ (const llvm::fltSemantics *)&APFloatBase::semX87DoubleExtended);
assert(partCount()==2);
uint64_t myexponent, mysignificand;
@@ -3560,7 +3533,8 @@ APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const {
}
APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const {
- assert(semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy);
+ assert(semantics ==
+ (const llvm::fltSemantics *)&APFloatBase::semPPCDoubleDoubleLegacy);
assert(partCount()==2);
uint64_t words[2];
@@ -3574,14 +3548,14 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const {
// Declare fltSemantics before APFloat that uses it (and
// saves pointer to it) to ensure correct destruction order.
fltSemantics extendedSemantics = *semantics;
- extendedSemantics.minExponent = semIEEEdouble.minExponent;
+ extendedSemantics.minExponent = APFloatBase::semIEEEdouble.minExponent;
IEEEFloat extended(*this);
fs = extended.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
IEEEFloat u(extended);
- fs = u.convert(semIEEEdouble, rmNearestTiesToEven, &losesInfo);
+ fs = u.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK || fs == opInexact);
(void)fs;
words[0] = *u.convertDoubleAPFloatToAPInt().getRawData();
@@ -3597,7 +3571,7 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const {
IEEEFloat v(extended);
v.subtract(u, rmNearestTiesToEven);
- fs = v.convert(semIEEEdouble, rmNearestTiesToEven, &losesInfo);
+ fs = v.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
words[1] = *v.convertDoubleAPFloatToAPInt().getRawData();
@@ -3611,8 +3585,9 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const {
template <const fltSemantics &S>
APInt IEEEFloat::convertIEEEFloatToAPInt() const {
assert(semantics == &S);
- const int bias =
- (semantics == &semFloat8E8M0FNU) ? -S.minExponent : -(S.minExponent - 1);
+ const int bias = (semantics == &APFloatBase::semFloat8E8M0FNU)
+ ? -S.minExponent
+ : -(S.minExponent - 1);
constexpr unsigned int trailing_significand_bits = S.precision - 1;
constexpr int integer_bit_part = trailing_significand_bits / integerPartWidth;
constexpr integerPart integer_bit =
@@ -3677,87 +3652,87 @@ APInt IEEEFloat::convertIEEEFloatToAPInt() const {
APInt IEEEFloat::convertQuadrupleAPFloatToAPInt() const {
assert(partCount() == 2);
- return convertIEEEFloatToAPInt<semIEEEquad>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEquad>();
}
APInt IEEEFloat::convertDoubleAPFloatToAPInt() const {
assert(partCount()==1);
- return convertIEEEFloatToAPInt<semIEEEdouble>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEdouble>();
}
APInt IEEEFloat::convertFloatAPFloatToAPInt() const {
assert(partCount()==1);
- return convertIEEEFloatToAPInt<semIEEEsingle>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEsingle>();
}
APInt IEEEFloat::convertBFloatAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semBFloat>();
+ return convertIEEEFloatToAPInt<APFloatBase::semBFloat>();
}
APInt IEEEFloat::convertHalfAPFloatToAPInt() const {
assert(partCount()==1);
- return convertIEEEFloatToAPInt<semIEEEhalf>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEhalf>();
}
APInt IEEEFloat::convertFloat8E5M2APFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E5M2>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E5M2>();
}
APInt IEEEFloat::convertFloat8E5M2FNUZAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E5M2FNUZ>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E5M2FNUZ>();
}
APInt IEEEFloat::convertFloat8E4M3APFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E4M3>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3>();
}
APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E4M3FN>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3FN>();
}
APInt IEEEFloat::convertFloat8E4M3FNUZAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E4M3FNUZ>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3FNUZ>();
}
APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E4M3B11FNUZ>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3B11FNUZ>();
}
APInt IEEEFloat::convertFloat8E3M4APFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E3M4>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E3M4>();
}
APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloatTF32>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloatTF32>();
}
APInt IEEEFloat::convertFloat8E8M0FNUAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E8M0FNU>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E8M0FNU>();
}
APInt IEEEFloat::convertFloat6E3M2FNAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat6E3M2FN>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat6E3M2FN>();
}
APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat6E2M3FN>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat6E2M3FN>();
}
APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const {
assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat4E2M1FN>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat4E2M1FN>();
}
// This function creates an APInt that is just a bit map of the floating
@@ -3765,74 +3740,77 @@ APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const {
// and treating the result as a normal integer is unlikely to be useful.
APInt IEEEFloat::bitcastToAPInt() const {
- if (semantics == (const llvm::fltSemantics*)&semIEEEhalf)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEhalf)
return convertHalfAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semBFloat)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semBFloat)
return convertBFloatAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics*)&semIEEEsingle)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle)
return convertFloatAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics*)&semIEEEdouble)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble)
return convertDoubleAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics*)&semIEEEquad)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad)
return convertQuadrupleAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy)
+ if (semantics ==
+ (const llvm::fltSemantics *)&APFloatBase::semPPCDoubleDoubleLegacy)
return convertPPCDoubleDoubleLegacyAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E5M2)
return convertFloat8E5M2APFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E5M2FNUZ)
return convertFloat8E5M2FNUZAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3)
return convertFloat8E4M3APFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3FN)
return convertFloat8E4M3FNAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3FNUZ)
return convertFloat8E4M3FNUZAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ)
+ if (semantics ==
+ (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3B11FNUZ)
return convertFloat8E4M3B11FNUZAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E3M4)
return convertFloat8E3M4APFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloatTF32)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloatTF32)
return convertFloatTF32APFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat8E8M0FNU)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E8M0FNU)
return convertFloat8E8M0FNUAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat6E3M2FN)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat6E3M2FN)
return convertFloat6E3M2FNAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat6E2M3FN)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat6E2M3FN)
return convertFloat6E2M3FNAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semFloat4E2M1FN)
+ if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat4E2M1FN)
return convertFloat4E2M1FNAPFloatToAPInt();
- assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
+ assert(semantics ==
+ (const llvm::fltSemantics *)&APFloatBase::semX87DoubleExtended &&
"unknown format!");
return convertF80LongDoubleAPFloatToAPInt();
}
float IEEEFloat::convertToFloat() const {
- assert(semantics == (const llvm::fltSemantics*)&semIEEEsingle &&
+ assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle &&
"Float semantics are not IEEEsingle");
APInt api = bitcastToAPInt();
return api.bitsToFloat();
}
double IEEEFloat::convertToDouble() const {
- assert(semantics == (const llvm::fltSemantics*)&semIEEEdouble &&
+ assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble &&
"Float semantics are not IEEEdouble");
APInt api = bitcastToAPInt();
return api.bitsToDouble();
@@ -3840,7 +3818,7 @@ double IEEEFloat::convertToDouble() const {
#ifdef HAS_IEE754_FLOAT128
float128 IEEEFloat::convertToQuad() const {
- assert(semantics == (const llvm::fltSemantics *)&semIEEEquad &&
+ assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad &&
"Float semantics are not IEEEquads");
APInt api = bitcastToAPInt();
return api.bitsToQuad();
@@ -3861,7 +3839,7 @@ void IEEEFloat::initFromF80LongDoubleAPInt(const APInt &api) {
uint64_t mysignificand = i1;
uint8_t myintegerbit = mysignificand >> 63;
- initialize(&semX87DoubleExtended);
+ initialize(&APFloatBase::semX87DoubleExtended);
assert(partCount()==2);
sign = static_cast<unsigned int>(i2>>15);
@@ -3893,14 +3871,16 @@ void IEEEFloat::initFromPPCDoubleDoubleLegacyAPInt(const APInt &api) {
// Get the first double and convert to our format.
initFromDoubleAPInt(APInt(64, i1));
- fs = convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo);
+ fs = convert(APFloatBase::semPPCDoubleDoubleLegacy, rmNearestTiesToEven,
+ &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
// Unless we have a special case, add in second double.
if (isFiniteNonZero()) {
- IEEEFloat v(semIEEEdouble, APInt(64, i2));
- fs = v.convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo);
+ IEEEFloat v(APFloatBase::semIEEEdouble, APInt(64, i2));
+ fs = v.convert(APFloatBase::semPPCDoubleDoubleLegacy, rmNearestTiesToEven,
+ &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
@@ -3918,7 +3898,7 @@ void IEEEFloat::initFromFloat8E8M0FNUAPInt(const APInt &api) {
uint64_t val = api.getRawData()[0];
uint64_t myexponent = (val & exponent_mask);
- initialize(&semFloat8E8M0FNU);
+ initialize(&APFloatBase::semFloat8E8M0FNU);
assert(partCount() == 1);
// This format has unsigned representation only
@@ -4025,109 +4005,109 @@ void IEEEFloat::initFromIEEEAPInt(const APInt &api) {
}
void IEEEFloat::initFromQuadrupleAPInt(const APInt &api) {
- initFromIEEEAPInt<semIEEEquad>(api);
+ initFromIEEEAPInt<APFloatBase::semIEEEquad>(api);
}
void IEEEFloat::initFromDoubleAPInt(const APInt &api) {
- initFromIEEEAPInt<semIEEEdouble>(api);
+ initFromIEEEAPInt<APFloatBase::semIEEEdouble>(api);
}
void IEEEFloat::initFromFloatAPInt(const APInt &api) {
- initFromIEEEAPInt<semIEEEsingle>(api);
+ initFromIEEEAPInt<APFloatBase::semIEEEsingle>(api);
}
void IEEEFloat::initFromBFloatAPInt(const APInt &api) {
- initFromIEEEAPInt<semBFloat>(api);
+ initFromIEEEAPInt<APFloatBase::semBFloat>(api);
}
void IEEEFloat::initFromHalfAPInt(const APInt &api) {
- initFromIEEEAPInt<semIEEEhalf>(api);
+ initFromIEEEAPInt<APFloatBase::semIEEEhalf>(api);
}
void IEEEFloat::initFromFloat8E5M2APInt(const APInt &api) {
- initFromIEEEAPInt<semFloat8E5M2>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat8E5M2>(api);
}
void IEEEFloat::initFromFloat8E5M2FNUZAPInt(const APInt &api) {
- initFromIEEEAPInt<semFloat8E5M2FNUZ>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat8E5M2FNUZ>(api);
}
void IEEEFloat::initFromFloat8E4M3APInt(const APInt &api) {
- initFromIEEEAPInt<semFloat8E4M3>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat8E4M3>(api);
}
void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) {
- initFromIEEEAPInt<semFloat8E4M3FN>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat8E4M3FN>(api);
}
void IEEEFloat::initFromFloat8E4M3FNUZAPInt(const APInt &api) {
- initFromIEEEAPInt<semFloat8E4M3FNUZ>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat8E4M3FNUZ>(api);
}
void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) {
- initFromIEEEAPInt<semFloat8E4M3B11FNUZ>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat8E4M3B11FNUZ>(api);
}
void IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) {
- initFromIEEEAPInt<semFloat8E3M4>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat8E3M4>(api);
}
void IEEEFloat::initFromFloatTF32APInt(const APInt &api) {
- initFromIEEEAPInt<semFloatTF32>(api);
+ initFromIEEEAPInt<APFloatBase::semFloatTF32>(api);
}
void IEEEFloat::initFromFloat6E3M2FNAPInt(const APInt &api) {
- initFromIEEEAPInt<semFloat6E3M2FN>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat6E3M2FN>(api);
}
void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) {
- initFromIEEEAPInt<semFloat6E2M3FN>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat6E2M3FN>(api);
}
void IEEEFloat::initFromFloat4E2M1FNAPInt(const APInt &api) {
- initFromIEEEAPInt<semFloat4E2M1FN>(api);
+ initFromIEEEAPInt<APFloatBase::semFloat4E2M1FN>(api);
}
/// Treat api as containing the bits of a floating point number.
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
assert(api.getBitWidth() == Sem->sizeInBits);
- if (Sem == &semIEEEhalf)
+ if (Sem == &APFloatBase::semIEEEhalf)
return initFromHalfAPInt(api);
- if (Sem == &semBFloat)
+ if (Sem == &APFloatBase::semBFloat)
return initFromBFloatAPInt(api);
- if (Sem == &semIEEEsingle)
+ if (Sem == &APFloatBase::semIEEEsingle)
return initFromFloatAPInt(api);
- if (Sem == &semIEEEdouble)
+ if (Sem == &APFloatBase::semIEEEdouble)
return initFromDoubleAPInt(api);
- if (Sem == &semX87DoubleExtended)
+ if (Sem == &APFloatBase::semX87DoubleExtended)
return initFromF80LongDoubleAPInt(api);
- if (Sem == &semIEEEquad)
+ if (Sem == &APFloatBase::semIEEEquad)
return initFromQuadrupleAPInt(api);
- if (Sem == &semPPCDoubleDoubleLegacy)
+ if (Sem == &APFloatBase::semPPCDoubleDoubleLegacy)
return initFromPPCDoubleDoubleLegacyAPInt(api);
- if (Sem == &semFloat8E5M2)
+ if (Sem == &APFloatBase::semFloat8E5M2)
return initFromFloat8E5M2APInt(api);
- if (Sem == &semFloat8E5M2FNUZ)
+ if (Sem == &APFloatBase::semFloat8E5M2FNUZ)
return initFromFloat8E5M2FNUZAPInt(api);
- if (Sem == &semFloat8E4M3)
+ if (Sem == &APFloatBase::semFloat8E4M3)
return initFromFloat8E4M3APInt(api);
- if (Sem == &semFloat8E4M3FN)
+ if (Sem == &APFloatBase::semFloat8E4M3FN)
return initFromFloat8E4M3FNAPInt(api);
- if (Sem == &semFloat8E4M3FNUZ)
+ if (Sem == &APFloatBase::semFloat8E4M3FNUZ)
return initFromFloat8E4M3FNUZAPInt(api);
- if (Sem == &semFloat8E4M3B11FNUZ)
+ if (Sem == &APFloatBase::semFloat8E4M3B11FNUZ)
return initFromFloat8E4M3B11FNUZAPInt(api);
- if (Sem == &semFloat8E3M4)
+ if (Sem == &APFloatBase::semFloat8E3M4)
return initFromFloat8E3M4APInt(api);
- if (Sem == &semFloatTF32)
+ if (Sem == &APFloatBase::semFloatTF32)
return initFromFloatTF32APInt(api);
- if (Sem == &semFloat8E8M0FNU)
+ if (Sem == &APFloatBase::semFloat8E8M0FNU)
return initFromFloat8E8M0FNUAPInt(api);
- if (Sem == &semFloat6E3M2FN)
+ if (Sem == &APFloatBase::semFloat6E3M2FN)
return initFromFloat6E3M2FNAPInt(api);
- if (Sem == &semFloat6E2M3FN)
+ if (Sem == &APFloatBase::semFloat6E2M3FN)
return initFromFloat6E2M3FNAPInt(api);
- if (Sem == &semFloat4E2M1FN)
+ if (Sem == &APFloatBase::semFloat4E2M1FN)
return initFromFloat4E2M1FNAPInt(api);
llvm_unreachable("unsupported semantics");
@@ -4202,11 +4182,11 @@ IEEEFloat::IEEEFloat(const fltSemantics &Sem, const APInt &API) {
}
IEEEFloat::IEEEFloat(float f) {
- initFromAPInt(&semIEEEsingle, APInt::floatToBits(f));
+ initFromAPInt(&APFloatBase::semIEEEsingle, APInt::floatToBits(f));
}
IEEEFloat::IEEEFloat(double d) {
- initFromAPInt(&semIEEEdouble, APInt::doubleToBits(d));
+ initFromAPInt(&APFloatBase::semIEEEdouble, APInt::doubleToBits(d));
}
namespace {
@@ -4815,38 +4795,40 @@ IEEEFloat frexp(const IEEEFloat &Val, int &Exp, roundingMode RM) {
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S)
: Semantics(&S),
- Floats(new APFloat[2]{APFloat(semIEEEdouble), APFloat(semIEEEdouble)}) {
- assert(Semantics == &semPPCDoubleDouble);
+ Floats(new APFloat[2]{APFloat(APFloatBase::semIEEEdouble),
+ APFloat(APFloatBase::semIEEEdouble)}) {
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, uninitializedTag)
- : Semantics(&S),
- Floats(new APFloat[2]{APFloat(semIEEEdouble, uninitialized),
- APFloat(semIEEEdouble, uninitialized)}) {
- assert(Semantics == &semPPCDoubleDouble);
+ : Semantics(&S), Floats(new APFloat[2]{
+ APFloat(APFloatBase::semIEEEdouble, uninitialized),
+ APFloat(APFloatBase::semIEEEdouble, uninitialized)}) {
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, integerPart I)
- : Semantics(&S), Floats(new APFloat[2]{APFloat(semIEEEdouble, I),
- APFloat(semIEEEdouble)}) {
- assert(Semantics == &semPPCDoubleDouble);
+ : Semantics(&S),
+ Floats(new APFloat[2]{APFloat(APFloatBase::semIEEEdouble, I),
+ APFloat(APFloatBase::semIEEEdouble)}) {
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, const APInt &I)
: Semantics(&S),
Floats(new APFloat[2]{
- APFloat(semIEEEdouble, APInt(64, I.getRawData()[0])),
- APFloat(semIEEEdouble, APInt(64, I.getRawData()[1]))}) {
- assert(Semantics == &semPPCDoubleDouble);
+ APFloat(APFloatBase::semIEEEdouble, APInt(64, I.getRawData()[0])),
+ APFloat(APFloatBase::semIEEEdouble, APInt(64, I.getRawData()[1]))}) {
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, APFloat &&First,
APFloat &&Second)
: Semantics(&S),
Floats(new APFloat[2]{std::move(First), std::move(Second)}) {
- assert(Semantics == &semPPCDoubleDouble);
- assert(&Floats[0].getSemantics() == &semIEEEdouble);
- assert(&Floats[1].getSemantics() == &semIEEEdouble);
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble);
+ assert(&Floats[0].getSemantics() == &APFloatBase::semIEEEdouble);
+ assert(&Floats[1].getSemantics() == &APFloatBase::semIEEEdouble);
}
DoubleAPFloat::DoubleAPFloat(const DoubleAPFloat &RHS)
@@ -4854,14 +4836,14 @@ DoubleAPFloat::DoubleAPFloat(const DoubleAPFloat &RHS)
Floats(RHS.Floats ? new APFloat[2]{APFloat(RHS.Floats[0]),
APFloat(RHS.Floats[1])}
: nullptr) {
- assert(Semantics == &semPPCDoubleDouble);
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(DoubleAPFloat &&RHS)
: Semantics(RHS.Semantics), Floats(RHS.Floats) {
- RHS.Semantics = &semBogus;
+ RHS.Semantics = &APFloatBase::semBogus;
RHS.Floats = nullptr;
- assert(Semantics == &semPPCDoubleDouble);
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble);
}
DoubleAPFloat &DoubleAPFloat::operator=(const DoubleAPFloat &RHS) {
@@ -5009,12 +4991,12 @@ APFloat::opStatus DoubleAPFloat::addWithSpecial(const DoubleAPFloat &LHS,
APFloat A(LHS.Floats[0]), AA(LHS.Floats[1]), C(RHS.Floats[0]),
CC(RHS.Floats[1]);
- assert(&A.getSemantics() == &semIEEEdouble);
- assert(&AA.getSemantics() == &semIEEEdouble);
- assert(&C.getSemantics() == &semIEEEdouble);
- assert(&CC.getSemantics() == &semIEEEdouble);
- assert(&Out.Floats[0].getSemantics() == &semIEEEdouble);
- assert(&Out.Floats[1].getSemantics() == &semIEEEdouble);
+ assert(&A.getSemantics() == &APFloatBase::semIEEEdouble);
+ assert(&AA.getSemantics() == &APFloatBase::semIEEEdouble);
+ assert(&C.getSemantics() == &APFloatBase::semIEEEdouble);
+ assert(&CC.getSemantics() == &APFloatBase::semIEEEdouble);
+ assert(&Out.Floats[0].getSemantics() == &APFloatBase::semIEEEdouble);
+ assert(&Out.Floats[1].getSemantics() == &APFloatBase::semIEEEdouble);
return Out.addImpl(A, AA, C, CC, RM);
}
@@ -5119,28 +5101,32 @@ APFloat::opStatus DoubleAPFloat::multiply(const DoubleAPFloat &RHS,
APFloat::opStatus DoubleAPFloat::divide(const DoubleAPFloat &RHS,
APFloat::roundingMode RM) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
- auto Ret =
- Tmp.divide(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM);
- *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.divide(
+ APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM);
+ *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt());
return Ret;
}
APFloat::opStatus DoubleAPFloat::remainder(const DoubleAPFloat &RHS) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
- auto Ret =
- Tmp.remainder(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
- *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.remainder(
+ APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
+ *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt());
return Ret;
}
APFloat::opStatus DoubleAPFloat::mod(const DoubleAPFloat &RHS) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
- auto Ret = Tmp.mod(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
- *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.mod(
+ APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
+ *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt());
return Ret;
}
@@ -5148,17 +5134,21 @@ APFloat::opStatus
DoubleAPFloat::fusedMultiplyAdd(const DoubleAPFloat &Multiplicand,
const DoubleAPFloat &Addend,
APFloat::roundingMode RM) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt());
auto Ret = Tmp.fusedMultiplyAdd(
- APFloat(semPPCDoubleDoubleLegacy, Multiplicand.bitcastToAPInt()),
- APFloat(semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()), RM);
- *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ APFloat(APFloatBase::semPPCDoubleDoubleLegacy,
+ Multiplicand.bitcastToAPInt()),
+ APFloat(APFloatBase::semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()),
+ RM);
+ *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt());
return Ret;
}
APFloat::opStatus DoubleAPFloat::roundToIntegral(APFloat::roundingMode RM) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
const APFloat &Hi = getFirst();
const APFloat &Lo = getSecond();
@@ -5309,22 +5299,28 @@ void DoubleAPFloat::makeZero(bool Neg) {
}
void DoubleAPFloat::makeLargest(bool Neg) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x7fefffffffffffffull));
- Floats[1] = APFloat(semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull));
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ Floats[0] =
+ APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x7fefffffffffffffull));
+ Floats[1] =
+ APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull));
if (Neg)
changeSign();
}
void DoubleAPFloat::makeSmallest(bool Neg) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
Floats[0].makeSmallest(Neg);
Floats[1].makeZero(/* Neg = */ false);
}
void DoubleAPFloat::makeSmallestNormalized(bool Neg) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x0360000000000000ull));
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ Floats[0] =
+ APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x0360000000000000ull));
if (Neg)
Floats[0].changeSign();
Floats[1].makeZero(/* Neg = */ false);
@@ -5355,7 +5351,8 @@ hash_code hash_value(const DoubleAPFloat &Arg) {
}
APInt DoubleAPFloat::bitcastToAPInt() const {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
uint64_t Data[] = {
Floats[0].bitcastToAPInt().getRawData()[0],
Floats[1].bitcastToAPInt().getRawData()[0],
@@ -5365,10 +5362,11 @@ APInt DoubleAPFloat::bitcastToAPInt() const {
Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S,
roundingMode RM) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat Tmp(semPPCDoubleDoubleLegacy);
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy);
auto Ret = Tmp.convertFromString(S, RM);
- *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt());
return Ret;
}
@@ -5379,7 +5377,8 @@ Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S,
// nextUp must choose the smallest output > input that follows these rules.
// nexDown must choose the largest output < input that follows these rules.
APFloat::opStatus DoubleAPFloat::next(bool nextDown) {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
// nextDown(x) = -nextUp(-x)
if (nextDown) {
changeSign();
@@ -5481,7 +5480,8 @@ APFloat::opStatus DoubleAPFloat::next(bool nextDown) {
APFloat::opStatus DoubleAPFloat::convertToSignExtendedInteger(
MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned,
roundingMode RM, bool *IsExact) const {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
// If Hi is not finite, or Lo is zero, the value is entirely represented
// by Hi. Delegate to the simpler single-APFloat conversion.
@@ -5761,8 +5761,9 @@ unsigned int DoubleAPFloat::convertToHexString(char *DST,
unsigned int HexDigits,
bool UpperCase,
roundingMode RM) const {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ return APFloat(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt())
.convertToHexString(DST, HexDigits, UpperCase, RM);
}
@@ -5799,7 +5800,8 @@ bool DoubleAPFloat::isLargest() const {
}
bool DoubleAPFloat::isInteger() const {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
return Floats[0].isInteger() && Floats[1].isInteger();
}
@@ -5807,8 +5809,9 @@ void DoubleAPFloat::toString(SmallVectorImpl<char> &Str,
unsigned FormatPrecision,
unsigned FormatMaxPadding,
bool TruncateZero) const {
- assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ assert(Semantics == &APFloatBase::semPPCDoubleDouble &&
+ "Unexpected Semantics");
+ APFloat(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt())
.toString(Str, FormatPrecision, FormatMaxPadding, TruncateZero);
}
@@ -5840,14 +5843,17 @@ int ilogb(const DoubleAPFloat &Arg) {
DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp,
APFloat::roundingMode RM) {
- assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
- return DoubleAPFloat(semPPCDoubleDouble, scalbn(Arg.Floats[0], Exp, RM),
+ assert(Arg.Semantics == &APFloatBase::PPCDoubleDouble() &&
+ "Unexpected Semantics");
+ return DoubleAPFloat(APFloatBase::PPCDoubleDouble(),
+ scalbn(Arg.Floats[0], Exp, RM),
scalbn(Arg.Floats[1], Exp, RM));
}
DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp,
APFloat::roundingMode RM) {
- assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ assert(Arg.Semantics == &APFloatBase::PPCDoubleDouble() &&
+ "Unexpected Semantics");
// Get the unbiased exponent e of the number, where |Arg| = m * 2^e for m in
// [1.0, 2.0).
@@ -5943,7 +5949,8 @@ DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp,
}
APFloat First = scalbn(Hi, -Exp, RM);
- return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second));
+ return DoubleAPFloat(APFloatBase::PPCDoubleDouble(), std::move(First),
+ std::move(Second));
}
} // namespace detail
@@ -5955,9 +5962,8 @@ APFloat::Storage::Storage(IEEEFloat F, const fltSemantics &Semantics) {
}
if (usesLayout<DoubleAPFloat>(Semantics)) {
const fltSemantics& S = F.getSemantics();
- new (&Double)
- DoubleAPFloat(Semantics, APFloat(std::move(F), S),
- APFloat(semIEEEdouble));
+ new (&Double) DoubleAPFloat(Semantics, APFloat(std::move(F), S),
+ APFloat(APFloatBase::IEEEdouble()));
return;
}
llvm_unreachable("Unexpected semantics");
@@ -6065,8 +6071,9 @@ APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics,
return U.IEEE.convert(ToSemantics, RM, losesInfo);
if (usesLayout<IEEEFloat>(getSemantics()) &&
usesLayout<DoubleAPFloat>(ToSemantics)) {
- assert(&ToSemantics == &semPPCDoubleDouble);
- auto Ret = U.IEEE.convert(semPPCDoubleDoubleLegacy, RM, losesInfo);
+ assert(&ToSemantics == &APFloatBase::semPPCDoubleDouble);
+ auto Ret =
+ U.IEEE.convert(APFloatBase::semPPCDoubleDoubleLegacy, RM, losesInfo);
*this = APFloat(ToSemantics, U.IEEE.bitcastToAPInt());
return Ret;
}
@@ -6113,13 +6120,15 @@ APFloat::opStatus APFloat::convertToInteger(APSInt &result,
}
double APFloat::convertToDouble() const {
- if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEdouble)
+ if (&getSemantics() ==
+ (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble)
return getIEEE().convertToDouble();
assert(isRepresentableBy(getSemantics(), semIEEEdouble) &&
"Float semantics is not representable by IEEEdouble");
APFloat Temp = *this;
bool LosesInfo;
- opStatus St = Temp.convert(semIEEEdouble, rmNearestTiesToEven, &LosesInfo);
+ opStatus St =
+ Temp.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &LosesInfo);
assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision");
(void)St;
return Temp.getIEEE().convertToDouble();
@@ -6127,13 +6136,14 @@ double APFloat::convertToDouble() const {
#ifdef HAS_IEE754_FLOAT128
float128 APFloat::convertToQuad() const {
- if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad)
+ if (&getSemantics() == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad)
return getIEEE().convertToQuad();
assert(isRepresentableBy(getSemantics(), semIEEEquad) &&
"Float semantics is not representable by IEEEquad");
APFloat Temp = *this;
bool LosesInfo;
- opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo);
+ opStatus St =
+ Temp.convert(APFloatBase::semIEEEquad, rmNearestTiesToEven, &LosesInfo);
assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision");
(void)St;
return Temp.getIEEE().convertToQuad();
@@ -6141,18 +6151,84 @@ float128 APFloat::convertToQuad() const {
#endif
float APFloat::convertToFloat() const {
- if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle)
+ if (&getSemantics() ==
+ (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle)
return getIEEE().convertToFloat();
assert(isRepresentableBy(getSemantics(), semIEEEsingle) &&
"Float semantics is not representable by IEEEsingle");
APFloat Temp = *this;
bool LosesInfo;
- opStatus St = Temp.convert(semIEEEsingle, rmNearestTiesToEven, &LosesInfo);
+ opStatus St =
+ Temp.convert(APFloatBase::semIEEEsingle, rmNearestTiesToEven, &LosesInfo);
assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision");
(void)St;
return Temp.getIEEE().convertToFloat();
}
+APFloat::Storage::~Storage() {
+ if (usesLayout<IEEEFloat>(*semantics)) {
+ IEEE.~IEEEFloat();
+ return;
+ }
+ if (usesLayout<DoubleAPFloat>(*semantics)) {
+ Double.~DoubleAPFloat();
+ return;
+ }
+ llvm_unreachable("Unexpected semantics");
+}
+
+APFloat::Storage::Storage(const APFloat::Storage &RHS) {
+ if (usesLayout<IEEEFloat>(*RHS.semantics)) {
+ new (this) IEEEFloat(RHS.IEEE);
+ return;
+ }
+ if (usesLayout<DoubleAPFloat>(*RHS.semantics)) {
+ new (this) DoubleAPFloat(RHS.Double);
+ return;
+ }
+ llvm_unreachable("Unexpected semantics");
+}
+
+APFloat::Storage::Storage(APFloat::Storage &&RHS) {
+ if (usesLayout<IEEEFloat>(*RHS.semantics)) {
+ new (this) IEEEFloat(std::move(RHS.IEEE));
+ return;
+ }
+ if (usesLayout<DoubleAPFloat>(*RHS.semantics)) {
+ new (this) DoubleAPFloat(std::move(RHS.Double));
+ return;
+ }
+ llvm_unreachable("Unexpected semantics");
+}
+
+APFloat::Storage &APFloat::Storage::operator=(const APFloat::Storage &RHS) {
+ if (usesLayout<IEEEFloat>(*semantics) &&
+ usesLayout<IEEEFloat>(*RHS.semantics)) {
+ IEEE = RHS.IEEE;
+ } else if (usesLayout<DoubleAPFloat>(*semantics) &&
+ usesLayout<DoubleAPFloat>(*RHS.semantics)) {
+ Double = RHS.Double;
+ } else if (this != &RHS) {
+ this->~Storage();
+ new (this) Storage(RHS);
+ }
+ return *this;
+}
+
+APFloat::Storage &APFloat::Storage::operator=(APFloat::Storage &&RHS) {
+ if (usesLayout<IEEEFloat>(*semantics) &&
+ usesLayout<IEEEFloat>(*RHS.semantics)) {
+ IEEE = std::move(RHS.IEEE);
+ } else if (usesLayout<DoubleAPFloat>(*semantics) &&
+ usesLayout<DoubleAPFloat>(*RHS.semantics)) {
+ Double = std::move(RHS.Double);
+ } else if (this != &RHS) {
+ this->~Storage();
+ new (this) Storage(std::move(RHS));
+ }
+ return *this;
+}
+
} // namespace llvm
#undef APFLOAT_DISPATCH_ON_SEMANTICS
diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp
index f2bbaab..299615a 100644
--- a/llvm/lib/Support/SourceMgr.cpp
+++ b/llvm/lib/Support/SourceMgr.cpp
@@ -69,11 +69,11 @@ unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
ErrorOr<std::unique_ptr<MemoryBuffer>>
SourceMgr::OpenIncludeFile(const std::string &Filename,
std::string &IncludedFile) {
- if (!FS)
- reportFatalInternalError("Opening include file from SourceMgr without VFS");
+ auto GetFile = [this](StringRef Path) {
+ return FS ? FS->getBufferForFile(Path) : MemoryBuffer::getFile(Path);
+ };
- ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr =
- FS->getBufferForFile(Filename);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr = GetFile(Filename);
SmallString<64> Buffer(Filename);
// If the file didn't exist directly, see if it's in an include path.
@@ -81,7 +81,7 @@ SourceMgr::OpenIncludeFile(const std::string &Filename,
++i) {
Buffer = IncludeDirectories[i];
sys::path::append(Buffer, Filename);
- NewBufOrErr = FS->getBufferForFile(Buffer);
+ NewBufOrErr = GetFile(Buffer);
}
if (NewBufOrErr)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1b559a6..8ed4062 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
- Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
+ MVT::i32, Legal);
setOperationAction(
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d0ad120..b841171 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1488,6 +1488,12 @@ let AssemblerPredicate = isGFX12Plus in {
def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>;
def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>;
+// Additional aliases for ds load transpose instructions.
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index e0375ea..e3f3aba 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -892,6 +892,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// have EXEC as implicit destination. Issue a warning if encoding for
// vdst is not EXEC.
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) &&
+ MCII->get(MI.getOpcode()).getNumDefs() == 0 &&
MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) {
auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO);
if (Bytes_[0] != ExecEncoding)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6de59be..8ea64d1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -3711,6 +3711,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+// Additional aliases for global load transpose instructions.
+def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 979a8b0..4b22c68 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
+#include <array>
namespace llvm {
@@ -45,7 +46,7 @@ struct GCNRegPressure {
return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
}
- void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ void clear() { Value.fill(0); }
unsigned getNumRegs(RegKind Kind) const {
assert(Kind < TOTAL_KINDS);
@@ -127,9 +128,7 @@ struct GCNRegPressure {
bool less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
- bool operator==(const GCNRegPressure &O) const {
- return std::equal(&Value[0], &Value[ValueArraySize], O.Value);
- }
+ bool operator==(const GCNRegPressure &O) const { return Value == O.Value; }
bool operator!=(const GCNRegPressure &O) const {
return !(*this == O);
@@ -160,7 +159,7 @@ private:
/// Pressure for all register kinds (first all regular registers kinds, then
/// all tuple register kinds).
- unsigned Value[ValueArraySize];
+ std::array<unsigned, ValueArraySize> Value;
static unsigned getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d516330..50447f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9072,6 +9072,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
const DebugLoc &DL = Inst.getDebugLoc();
+ if (ST.useRealTrue16Insts()) {
+ Register SrcReg0, SrcReg1;
+ if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
+ SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
+ } else {
+ SrcReg0 = Src0.getReg();
+ }
+
+ if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
+ SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
+ } else {
+ SrcReg1 = Src1.getReg();
+ }
+
+ bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
+ bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
+
+ auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_LH_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HL_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HH_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ return;
+ }
+
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e979eeb..df27ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -879,6 +879,11 @@ public:
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
}
+ bool isMFMA(uint16_t Opcode) const {
+ return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ }
+
static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}
@@ -895,6 +900,10 @@ public:
return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
}
+ bool isMFMAorWMMA(uint16_t Opcode) const {
+ return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode);
+ }
+
static bool isSWMMAC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 01a40c1..7431e11 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ private:
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
- // Check if the machine instruction being processed is a supported packed
- // instruction.
- bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
- if (!TII->isNeverCoissue(MI))
- return false;
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_FMA_F32:
- return true;
- default:
- return false;
- }
- llvm_unreachable("Fully covered switch");
-}
-
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
+ uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
+ bool IsUnpackable =
+ !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
if (Instr.isMetaInstruction())
continue;
if ((Instr.isTerminator()) ||
- (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+ (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
@@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
}
- if (!isUnpackingSupportedInstr(Instr))
+ if (!IsUnpackable)
continue;
if (canUnpackingClobberRegister(Instr))
@@ -687,8 +670,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
- const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+ const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+ const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
- addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+ addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- const MachineOperand *SrcMO3 =
+ const MachineOperand *SrcMO2 =
TII->getNamedOperand(I, AMDGPU::OpName::src2);
unsigned Src2Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
- addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+ addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers. safe to assign them 0
@@ -787,9 +770,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// TODO: Fold this into previous block, if possible. Evaluate and handle any
// side effects.
+
+ // Perform the extra MF scans only on subtargets with GFX940 instructions.
+ if (!ST.hasGFX940Insts())
+ return Changed;
for (MachineBasicBlock &MBB : MF) {
- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
- // to co-issue unpacked instructions with MFMA
+ // Unpack packed instructions overlapped by MFMAs. This allows the
+ // compiler to co-issue the unpacked instructions with the MFMA.
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 67ea2dd..35e1127 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21287,21 +21287,28 @@ bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const {
}
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
+ // The MSVC CRT provides facilities for stack protection.
RTLIB::LibcallImpl SecurityCheckCookieLibcall =
getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
- if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
- return TargetLowering::insertSSPDeclarations(M);
- // MSVC CRT has a global variable holding security cookie.
- M.getOrInsertGlobal("__security_cookie",
- PointerType::getUnqual(M.getContext()));
+ RTLIB::LibcallImpl SecurityCookieVar =
+ getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
+ SecurityCookieVar != RTLIB::Unsupported) {
+ // The MSVC CRT has a global variable holding the security cookie.
+ M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
+ PointerType::getUnqual(M.getContext()));
- // MSVC CRT has a function to validate security cookie.
- FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- getLibcallImplName(SecurityCheckCookieLibcall),
- Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
- if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
- F->addParamAttr(0, Attribute::AttrKind::InReg);
+ // The MSVC CRT has a function to validate the security cookie.
+ FunctionCallee SecurityCheckCookie =
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
+ F->addParamAttr(0, Attribute::AttrKind::InReg);
+ }
+
+ TargetLowering::insertSSPDeclarations(M);
}
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 272c21f..2f1a7ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -749,7 +749,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTruncStoreAction(VT, MVT::i1, Expand);
}
- // Disable generations of extload/truncstore for v2i16/v2i8. The generic
+ // Disable generation of extload/truncstore for v2i32/v2i16/v2i8. The generic
// expansion for these nodes when they are unaligned is incorrect if the
// type is a vector.
//
@@ -757,7 +757,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// TargetLowering::expandUnalignedLoad/Store.
setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
MVT::v2i8, Expand);
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,
+ {MVT::v2i8, MVT::v2i16}, Expand);
setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
// Register custom handling for illegal type loads/stores. We'll try to custom
// lower almost all illegal types and logic in the lowering will discard cases
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 5ceb477..19992e6 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -695,6 +695,9 @@ def HasStdExtZvfbfa : Predicate<"Subtarget->hasStdExtZvfbfa()">,
def FeatureStdExtZvfbfmin
: RISCVExtension<1, 0, "Vector BF16 Converts", [FeatureStdExtZve32f]>;
+def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">,
+ AssemblerPredicate<(all_of FeatureStdExtZvfbfmin),
+ "'Zvfbfmin' (Vector BF16 Converts)">;
def FeatureStdExtZvfbfwma
: RISCVExtension<1, 0, "Vector BF16 widening mul-add",
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index eb87558..169465e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24830,7 +24830,8 @@ bool RISCVTargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// instruction, as it is usually smaller than the alternative sequence.
// TODO: Add vector division?
bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
- return OptSize && !VT.isVector();
+ return OptSize && !VT.isVector() &&
+ VT.getSizeInBits() <= getMaxDivRemBitWidthSupported();
}
bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const {
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 1b7cb9b..636e31c 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -699,7 +699,8 @@ public:
"Can't encode VTYPE for uninitialized or unknown");
if (TWiden != 0)
return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt);
- return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
+ return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic,
+ AltFmt);
}
bool hasSEWLMULRatioOnly() const { return SEWLMULRatioOnly; }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index ddb53a2..12f776b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3775,11 +3775,13 @@ std::string RISCVInstrInfo::createMIROperandComment(
#define CASE_VFMA_OPCODE_VV(OP) \
CASE_VFMA_OPCODE_LMULS_MF4(OP, VV, E16): \
+ case CASE_VFMA_OPCODE_LMULS_MF4(OP##_ALT, VV, E16): \
case CASE_VFMA_OPCODE_LMULS_MF2(OP, VV, E32): \
case CASE_VFMA_OPCODE_LMULS_M1(OP, VV, E64)
#define CASE_VFMA_SPLATS(OP) \
CASE_VFMA_OPCODE_LMULS_MF4(OP, VFPR16, E16): \
+ case CASE_VFMA_OPCODE_LMULS_MF4(OP##_ALT, VFPR16, E16): \
case CASE_VFMA_OPCODE_LMULS_MF2(OP, VFPR32, E32): \
case CASE_VFMA_OPCODE_LMULS_M1(OP, VFPR64, E64)
// clang-format on
@@ -4003,11 +4005,13 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
#define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP) \
CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP##_ALT, NEWOP##_ALT, VV, E16) \
CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32) \
CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64)
#define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP) \
CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VFPR16, E16) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP##_ALT, NEWOP##_ALT, VFPR16, E16) \
CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VFPR32, E32) \
CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VFPR64, E64)
// clang-format on
@@ -4469,6 +4473,20 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const {
CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E32) \
CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16) \
CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E32) \
+
+#define CASE_FP_WIDEOP_OPCODE_LMULS_ALT(OP) \
+ CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF4, E16): \
+ case CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF2, E16): \
+ case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M1, E16): \
+ case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M2, E16): \
+ case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M4, E16)
+
+#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(OP) \
+ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4, E16) \
+ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2, E16) \
+ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1, E16) \
+ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E16) \
+ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16)
// clang-format on
MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
@@ -4478,6 +4496,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
switch (MI.getOpcode()) {
default:
return nullptr;
+ case CASE_FP_WIDEOP_OPCODE_LMULS_ALT(FWADD_ALT_WV):
+ case CASE_FP_WIDEOP_OPCODE_LMULS_ALT(FWSUB_ALT_WV):
case CASE_FP_WIDEOP_OPCODE_LMULS(FWADD_WV):
case CASE_FP_WIDEOP_OPCODE_LMULS(FWSUB_WV): {
assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) &&
@@ -4494,6 +4514,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
llvm_unreachable("Unexpected opcode");
CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV)
CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV)
+ CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(FWADD_ALT_WV)
+ CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(FWSUB_ALT_WV)
}
// clang-format on
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 65865ce..eb3c9b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -5862,20 +5862,6 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction,
}
}
-multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction,
- bit isSEWAware = 0> {
- foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in
- {
- defvar fvti = fvtiToFWti.Vti;
- defvar fwti = fvtiToFWti.Wti;
- let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
- GetVTypePredicates<fwti>.Predicates) in
- defm : VPatConversion<intrinsic, instruction, "V",
- fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
- fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
- }
-}
-
multiclass VPatConversionVI_WF<string intrinsic, string instruction> {
foreach vtiToWti = AllWidenableIntToFloatVectors in {
defvar vti = vtiToWti.Vti;
@@ -5969,20 +5955,6 @@ multiclass VPatConversionVF_WF_RTZ<string intrinsic, string instruction,
}
}
-multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction,
- bit isSEWAware = 0> {
- foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
- defvar fvti = fvtiToFWti.Vti;
- defvar fwti = fvtiToFWti.Wti;
- let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
- GetVTypePredicates<fwti>.Predicates) in
- defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
- fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
- fvti.LMul, fvti.RegClass, fwti.RegClass,
- isSEWAware>;
- }
-}
-
multiclass VPatCompare_VI<string intrinsic, string inst,
ImmLeaf ImmType> {
foreach vti = AllIntegerVectors in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
index 0be9eab..9358486 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
@@ -36,7 +36,7 @@ defm VFWMACCBF16_V : VWMAC_FV_V_F<"vfwmaccbf16", 0b111011>;
//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
+let Predicates = [HasStdExtZvfbfmin] in {
defm PseudoVFWCVTBF16_F_F : VPseudoVWCVTD_V;
defm PseudoVFNCVTBF16_F_F : VPseudoVNCVTD_W_RM;
}
@@ -44,10 +44,364 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
let mayRaiseFPException = true, Predicates = [HasStdExtZvfbfwma] in
defm PseudoVFWMACCBF16 : VPseudoVWMAC_VV_VF_BF_RM;
+defset list<VTypeInfoToWide> AllWidenableIntToBF16Vectors = {
+ def : VTypeInfoToWide<VI8MF8, VBF16MF4>;
+ def : VTypeInfoToWide<VI8MF4, VBF16MF2>;
+ def : VTypeInfoToWide<VI8MF2, VBF16M1>;
+ def : VTypeInfoToWide<VI8M1, VBF16M2>;
+ def : VTypeInfoToWide<VI8M2, VBF16M4>;
+ def : VTypeInfoToWide<VI8M4, VBF16M8>;
+}
+
+multiclass VPseudoVALU_VV_VF_RM_BF16 {
+ foreach m = MxListF in {
+ defm "" : VPseudoBinaryFV_VV_RM<m, 16/*sew*/>,
+ SchedBinary<"WriteVFALUV", "ReadVFALUV", "ReadVFALUV", m.MX,
+ 16/*sew*/, forcePassthruRead=true>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxList in {
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
+ SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
+ f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVALU_VF_RM_BF16 {
+ defvar f = SCALAR_F16;
+ foreach m = f.MxList in {
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
+ SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
+ f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVFWALU_VV_VF_RM_BF16 {
+ foreach m = MxListFW in {
+ defm "" : VPseudoBinaryW_VV_RM<m, sew=16>,
+ SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX,
+ 16/*sew*/, forcePassthruRead=true>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxListFW in {
+ defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>,
+ SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX,
+ f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVFWALU_WV_WF_RM_BF16 {
+ foreach m = MxListFW in {
+ defm "" : VPseudoBinaryW_WV_RM<m, sew=16>,
+ SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX,
+ 16/*sew*/, forcePassthruRead=true>;
+ }
+ defvar f = SCALAR_F16;
+ foreach m = f.MxListFW in {
+ defm "" : VPseudoBinaryW_WF_RM<m, f, sew=f.SEW>,
+ SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX,
+ f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVFMUL_VV_VF_RM_BF16 {
+ foreach m = MxListF in {
+ defm "" : VPseudoBinaryFV_VV_RM<m, 16/*sew*/>,
+ SchedBinary<"WriteVFMulV", "ReadVFMulV", "ReadVFMulV", m.MX,
+ 16/*sew*/, forcePassthruRead=true>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxList in {
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
+ SchedBinary<"WriteVFMulF", "ReadVFMulV", "ReadVFMulF", m.MX,
+ f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVWMUL_VV_VF_RM_BF16 {
+ foreach m = MxListFW in {
+ defm "" : VPseudoBinaryW_VV_RM<m, sew=16>,
+ SchedBinary<"WriteVFWMulV", "ReadVFWMulV", "ReadVFWMulV", m.MX,
+ 16/*sew*/, forcePassthruRead=true>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxListFW in {
+ defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>,
+ SchedBinary<"WriteVFWMulF", "ReadVFWMulV", "ReadVFWMulF", m.MX,
+ f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVMAC_VV_VF_AAXA_RM_BF16 {
+ foreach m = MxListF in {
+ defm "" : VPseudoTernaryV_VV_AAXA_RM<m, 16/*sew*/>,
+ SchedTernary<"WriteVFMulAddV", "ReadVFMulAddV", "ReadVFMulAddV",
+ "ReadVFMulAddV", m.MX, 16/*sew*/>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxList in {
+ defm "" : VPseudoTernaryV_VF_AAXA_RM<m, f, f.SEW>,
+ SchedTernary<"WriteVFMulAddF", "ReadVFMulAddV", "ReadVFMulAddF",
+ "ReadVFMulAddV", m.MX, f.SEW>;
+ }
+}
+
+multiclass VPseudoVWMAC_VV_VF_RM_BF16 {
+ foreach m = MxListFW in {
+ defm "" : VPseudoTernaryW_VV_RM<m, sew=16>,
+ SchedTernary<"WriteVFWMulAddV", "ReadVFWMulAddV",
+ "ReadVFWMulAddV", "ReadVFWMulAddV", m.MX, 16/*sew*/>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxListFW in {
+ defm "" : VPseudoTernaryW_VF_RM<m, f, sew=f.SEW>,
+ SchedTernary<"WriteVFWMulAddF", "ReadVFWMulAddV",
+ "ReadVFWMulAddF", "ReadVFWMulAddV", m.MX, f.SEW>;
+ }
+}
+
+multiclass VPseudoVRCP_V_BF16 {
+ foreach m = MxListF in {
+ defvar mx = m.MX;
+ let VLMul = m.value in {
+ def "_V_" # mx # "_E16"
+ : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/,
+ forcePassthruRead=true>;
+ def "_V_" # mx # "_E16_MASK"
+ : VPseudoUnaryMask<m.vrclass, m.vrclass>,
+ RISCVMaskedPseudo<MaskIdx = 2>,
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/,
+ forcePassthruRead=true>;
+ }
+ }
+}
+
+multiclass VPseudoVRCP_V_RM_BF16 {
+ foreach m = MxListF in {
+ defvar mx = m.MX;
+ let VLMul = m.value in {
+ def "_V_" # mx # "_E16"
+ : VPseudoUnaryNoMaskRoundingMode<m.vrclass, m.vrclass>,
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/,
+ forcePassthruRead=true>;
+ def "_V_" # mx # "_E16_MASK"
+ : VPseudoUnaryMaskRoundingMode<m.vrclass, m.vrclass>,
+ RISCVMaskedPseudo<MaskIdx = 2>,
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/,
+ forcePassthruRead=true>;
+ }
+ }
+}
+
+multiclass VPseudoVMAX_VV_VF_BF16 {
+ foreach m = MxListF in {
+ defm "" : VPseudoBinaryV_VV<m, sew=16>,
+ SchedBinary<"WriteVFMinMaxV", "ReadVFMinMaxV", "ReadVFMinMaxV",
+ m.MX, 16/*sew*/, forcePassthruRead=true>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxList in {
+ defm "" : VPseudoBinaryV_VF<m, f, f.SEW>,
+ SchedBinary<"WriteVFMinMaxF", "ReadVFMinMaxV", "ReadVFMinMaxF",
+ m.MX, f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVSGNJ_VV_VF_BF16 {
+ foreach m = MxListF in {
+ defm "" : VPseudoBinaryV_VV<m, sew=16>,
+ SchedBinary<"WriteVFSgnjV", "ReadVFSgnjV", "ReadVFSgnjV", m.MX,
+ 16/*sew*/, forcePassthruRead=true>;
+ }
+
+ defvar f = SCALAR_F16;
+ foreach m = f.MxList in {
+ defm "" : VPseudoBinaryV_VF<m, f, f.SEW>,
+ SchedBinary<"WriteVFSgnjF", "ReadVFSgnjV", "ReadVFSgnjF", m.MX,
+ f.SEW, forcePassthruRead=true>;
+ }
+}
+
+multiclass VPseudoVWCVTF_V_BF16 {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxListW in
+ defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=8,
+ TargetConstraintType=3>,
+ SchedUnary<"WriteVFWCvtIToFV", "ReadVFWCvtIToFV", m.MX, 8/*sew*/,
+ forcePassthruRead=true>;
+}
+
+multiclass VPseudoVWCVTD_V_BF16 {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxListFW in
+ defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=16,
+ TargetConstraintType=3>,
+ SchedUnary<"WriteVFWCvtFToFV", "ReadVFWCvtFToFV", m.MX, 16/*sew*/,
+ forcePassthruRead=true>;
+}
+
+multiclass VPseudoVNCVTD_W_BF16 {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxListFW in
+ defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint, sew=16,
+ TargetConstraintType=2>,
+ SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, 16/*sew*/,
+ forcePassthruRead=true>;
+}
+
+multiclass VPseudoVNCVTD_W_RM_BF16 {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxListFW in
+ defm _W : VPseudoConversionRoundingMode<m.vrclass, m.wvrclass, m,
+ constraint, sew=16,
+ TargetConstraintType=2>,
+ SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, 16/*sew*/,
+ forcePassthruRead=true>;
+}
+
+let Predicates = [HasStdExtZvfbfa], AltFmtType = IS_ALTFMT in {
+let mayRaiseFPException = true in {
+defm PseudoVFADD_ALT : VPseudoVALU_VV_VF_RM_BF16;
+defm PseudoVFSUB_ALT : VPseudoVALU_VV_VF_RM_BF16;
+defm PseudoVFRSUB_ALT : VPseudoVALU_VF_RM_BF16;
+}
+
+let mayRaiseFPException = true in {
+defm PseudoVFWADD_ALT : VPseudoVFWALU_VV_VF_RM_BF16;
+defm PseudoVFWSUB_ALT : VPseudoVFWALU_VV_VF_RM_BF16;
+defm PseudoVFWADD_ALT : VPseudoVFWALU_WV_WF_RM_BF16;
+defm PseudoVFWSUB_ALT : VPseudoVFWALU_WV_WF_RM_BF16;
+}
+
+let mayRaiseFPException = true in
+defm PseudoVFMUL_ALT : VPseudoVFMUL_VV_VF_RM_BF16;
+
+let mayRaiseFPException = true in
+defm PseudoVFWMUL_ALT : VPseudoVWMUL_VV_VF_RM_BF16;
+
+let mayRaiseFPException = true in {
+defm PseudoVFMACC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+defm PseudoVFNMACC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+defm PseudoVFMSAC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+defm PseudoVFNMSAC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+defm PseudoVFMADD_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+defm PseudoVFNMADD_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+defm PseudoVFMSUB_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+defm PseudoVFNMSUB_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16;
+}
+
+let mayRaiseFPException = true in {
+defm PseudoVFWMACC_ALT : VPseudoVWMAC_VV_VF_RM_BF16;
+defm PseudoVFWNMACC_ALT : VPseudoVWMAC_VV_VF_RM_BF16;
+defm PseudoVFWMSAC_ALT : VPseudoVWMAC_VV_VF_RM_BF16;
+defm PseudoVFWNMSAC_ALT : VPseudoVWMAC_VV_VF_RM_BF16;
+}
+
+let mayRaiseFPException = true in
+defm PseudoVFRSQRT7_ALT : VPseudoVRCP_V_BF16;
+
+let mayRaiseFPException = true in
+defm PseudoVFREC7_ALT : VPseudoVRCP_V_RM_BF16;
+
+let mayRaiseFPException = true in {
+defm PseudoVFMIN_ALT : VPseudoVMAX_VV_VF_BF16;
+defm PseudoVFMAX_ALT : VPseudoVMAX_VV_VF_BF16;
+}
+
+defm PseudoVFSGNJ_ALT : VPseudoVSGNJ_VV_VF_BF16;
+defm PseudoVFSGNJN_ALT : VPseudoVSGNJ_VV_VF_BF16;
+defm PseudoVFSGNJX_ALT : VPseudoVSGNJ_VV_VF_BF16;
+
+let mayRaiseFPException = true in {
+defm PseudoVMFEQ_ALT : VPseudoVCMPM_VV_VF;
+defm PseudoVMFNE_ALT : VPseudoVCMPM_VV_VF;
+defm PseudoVMFLT_ALT : VPseudoVCMPM_VV_VF;
+defm PseudoVMFLE_ALT : VPseudoVCMPM_VV_VF;
+defm PseudoVMFGT_ALT : VPseudoVCMPM_VF;
+defm PseudoVMFGE_ALT : VPseudoVCMPM_VF;
+}
+
+defm PseudoVFCLASS_ALT : VPseudoVCLS_V;
+
+defm PseudoVFMERGE_ALT : VPseudoVMRG_FM;
+
+defm PseudoVFMV_V_ALT : VPseudoVMV_F;
+
+let mayRaiseFPException = true in {
+defm PseudoVFWCVT_F_XU_ALT : VPseudoVWCVTF_V_BF16;
+defm PseudoVFWCVT_F_X_ALT : VPseudoVWCVTF_V_BF16;
+
+defm PseudoVFWCVT_F_F_ALT : VPseudoVWCVTD_V_BF16;
+} // mayRaiseFPException = true
+
+let mayRaiseFPException = true in {
+let hasSideEffects = 0, hasPostISelHook = 1 in {
+defm PseudoVFNCVT_XU_F_ALT : VPseudoVNCVTI_W_RM;
+defm PseudoVFNCVT_X_F_ALT : VPseudoVNCVTI_W_RM;
+}
+
+defm PseudoVFNCVT_RTZ_XU_F_ALT : VPseudoVNCVTI_W;
+defm PseudoVFNCVT_RTZ_X_F_ALT : VPseudoVNCVTI_W;
+
+defm PseudoVFNCVT_F_F_ALT : VPseudoVNCVTD_W_RM_BF16;
+
+defm PseudoVFNCVT_ROD_F_F_ALT : VPseudoVNCVTD_W_BF16;
+} // mayRaiseFPException = true
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ defvar f = SCALAR_F16;
+ let HasSEWOp = 1, BaseInstr = VFMV_F_S in
+ def "PseudoVFMV_" # f.FX # "_S_ALT" :
+ RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>,
+ Sched<[WriteVMovFS, ReadVMovFS]>;
+ let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1,
+ Constraints = "$rd = $passthru" in
+ def "PseudoVFMV_S_" # f.FX # "_ALT" :
+ RISCVVPseudo<(outs VR:$rd),
+ (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew)>,
+ Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
+}
+
+defm PseudoVFSLIDE1UP_ALT : VPseudoVSLD1_VF<"@earlyclobber $rd">;
+defm PseudoVFSLIDE1DOWN_ALT : VPseudoVSLD1_VF;
+} // Predicates = [HasStdExtZvfbfa], AltFmtType = IS_ALTFMT
+
//===----------------------------------------------------------------------===//
// Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
+multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in
+ {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
+ fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
+ }
+}
+
+multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
+ fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
+ fvti.LMul, fvti.RegClass, fwti.RegClass,
+ isSEWAware>;
+ }
+}
+
+let Predicates = [HasStdExtZvfbfmin] in {
defm : VPatConversionWF_VF_BF<"int_riscv_vfwcvtbf16_f_f_v",
"PseudoVFWCVTBF16_F_F", isSEWAware=1>;
defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w",
@@ -56,7 +410,6 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- let Predicates = [HasVInstructionsBF16Minimal] in
def : Pat<(fwti.Vector (any_riscv_fpextend_vl
(fvti.Vector fvti.RegClass:$rs1),
(fvti.Mask VMV0:$vm),
@@ -66,18 +419,16 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
(fvti.Mask VMV0:$vm),
GPR:$vl, fvti.Log2SEW, TA_MA)>;
- let Predicates = [HasVInstructionsBF16Minimal] in
- def : Pat<(fvti.Vector (any_riscv_fpround_vl
- (fwti.Vector fwti.RegClass:$rs1),
- (fwti.Mask VMV0:$vm), VLOpFrag)),
- (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
- (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
- (fwti.Mask VMV0:$vm),
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, fvti.Log2SEW, TA_MA)>;
- let Predicates = [HasVInstructionsBF16Minimal] in
+ def : Pat<(fvti.Vector (any_riscv_fpround_vl
+ (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask VMV0:$vm), VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
+ (fwti.Mask VMV0:$vm),
+ // Value to indicate no rounding mode change in
+ // RISCVInsertReadWriteCSR
+ FRM_DYN,
+ GPR:$vl, fvti.Log2SEW, TA_MA)>;
def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
(!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
(fvti.Vector (IMPLICIT_DEF)),
@@ -87,6 +438,130 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
FRM_DYN,
fvti.AVL, fvti.Log2SEW, TA_MA)>;
}
+
+ defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllBF16Vectors>;
+ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
+ AllBF16Vectors, uimm5>;
+ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
+ eew=16, vtilist=AllBF16Vectors>;
+ defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllBF16Vectors, uimm5>;
+ defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllBF16Vectors, uimm5>;
+
+ foreach fvti = AllBF16Vectors in {
+ defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM",
+ fvti.Vector,
+ fvti.Vector, fvti.Vector, fvti.Mask,
+ fvti.Log2SEW, fvti.LMul, fvti.RegClass,
+ fvti.RegClass, fvti.RegClass>;
+ defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE",
+ "V"#fvti.ScalarSuffix#"M",
+ fvti.Vector,
+ fvti.Vector, fvti.Scalar, fvti.Mask,
+ fvti.Log2SEW, fvti.LMul, fvti.RegClass,
+ fvti.RegClass, fvti.ScalarRegClass>;
+ defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
+ def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru),
+ (fvti.Vector fvti.RegClass:$rs2),
+ (fvti.Scalar (fpimm0)),
+ (fvti.Mask VMV0:$vm), VLOpFrag)),
+ (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0,
+ (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>;
+
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1,
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
+ (fvti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm),
+ fvti.AVL, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
+ (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))),
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
+ (fvti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
+ (SplatFPOp (fvti.Scalar fpimm0)),
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
+ (fvti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
+ (SplatFPOp fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
+ (fvti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs2,
+ (fvti.Scalar fvti.ScalarRegClass:$rs1),
+ (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
+ fvti.RegClass:$rs1,
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$passthru,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm),
+ GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
+ (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))),
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$passthru,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm),
+ GPR:$vl, fvti.Log2SEW)>;
+
+
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
+ (SplatFPOp (fvti.Scalar fpimm0)),
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$passthru,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm),
+ GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm),
+ (SplatFPOp fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$passthru,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2,
+ (fvti.Scalar fvti.ScalarRegClass:$rs1),
+ (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector
+ (riscv_vrgather_vv_vl fvti.RegClass:$rs2,
+ (ivti.Vector fvti.RegClass:$rs1),
+ fvti.RegClass:$passthru,
+ (fvti.Mask VMV0:$vm),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VV_"# fvti.LMul.MX#"_E"# fvti.SEW#"_MASK")
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1,
+ (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
+ def : Pat<(fvti.Vector (riscv_vrgather_vx_vl fvti.RegClass:$rs2, GPR:$rs1,
+ fvti.RegClass:$passthru,
+ (fvti.Mask VMV0:$vm),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VX_"# fvti.LMul.MX#"_MASK")
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$rs1,
+ (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
+ def : Pat<(fvti.Vector
+ (riscv_vrgather_vx_vl fvti.RegClass:$rs2,
+ uimm5:$imm,
+ fvti.RegClass:$passthru,
+ (fvti.Mask VMV0:$vm),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VI_"# fvti.LMul.MX#"_MASK")
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, uimm5:$imm,
+ (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
+ }
}
let Predicates = [HasStdExtZvfbfwma] in {
@@ -97,3 +572,224 @@ let Predicates = [HasStdExtZvfbfwma] in {
defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16",
AllWidenableBF16ToFloatVectors>;
}
+
+multiclass VPatConversionVI_VF_BF16<string intrinsic, string instruction> {
+ foreach fvti = AllBF16Vectors in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
+ GetVTypePredicates<ivti>.Predicates) in
+ defm : VPatConversion<intrinsic, instruction, "V",
+ ivti.Vector, fvti.Vector, ivti.Mask, fvti.Log2SEW,
+ fvti.LMul, ivti.RegClass, fvti.RegClass>;
+ }
+}
+
+multiclass VPatConversionWF_VI_BF16<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach vtiToWti = AllWidenableIntToBF16Vectors in {
+ defvar vti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+ let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+ GetVTypePredicates<fwti>.Predicates) in
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fwti.Vector, vti.Vector, fwti.Mask, vti.Log2SEW,
+ vti.LMul, fwti.RegClass, vti.RegClass, isSEWAware>;
+ }
+}
+
+multiclass VPatConversionWF_VF_BF16<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+ GetVTypeMinimalPredicates<fwti>.Predicates) in
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
+ fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
+ }
+}
+
+multiclass VPatConversionVI_WF_BF16<string intrinsic, string instruction> {
+ foreach vtiToWti = AllWidenableIntToBF16Vectors in {
+ defvar vti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+ let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+ GetVTypePredicates<fwti>.Predicates) in
+ defm : VPatConversion<intrinsic, instruction, "W",
+ vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, fwti.RegClass>;
+ }
+}
+
+multiclass VPatConversionVI_WF_RM_BF16<string intrinsic, string instruction> {
+ foreach vtiToWti = AllWidenableIntToBF16Vectors in {
+ defvar vti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+ let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+ GetVTypePredicates<fwti>.Predicates) in
+ defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
+ vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, fwti.RegClass>;
+ }
+}
+
+multiclass VPatConversionVF_WF_BF16<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
+ GetVTypePredicates<fwti>.Predicates) in
+ defm : VPatConversion<intrinsic, instruction, "W",
+ fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
+ fvti.LMul, fvti.RegClass, fwti.RegClass, isSEWAware>;
+ }
+}
+
+let Predicates = [HasStdExtZvfbfa] in {
+defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfadd", "PseudoVFADD_ALT",
+ AllBF16Vectors, isSEWAware = 1>;
+defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfsub", "PseudoVFSUB_ALT",
+ AllBF16Vectors, isSEWAware = 1>;
+defm : VPatBinaryV_VX_RM<"int_riscv_vfrsub", "PseudoVFRSUB_ALT",
+ AllBF16Vectors, isSEWAware = 1>;
+defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwadd", "PseudoVFWADD_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwsub", "PseudoVFWSUB_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatBinaryW_WV_WX_RM<"int_riscv_vfwadd_w", "PseudoVFWADD_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatBinaryW_WV_WX_RM<"int_riscv_vfwsub_w", "PseudoVFWSUB_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfmul", "PseudoVFMUL_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwmul", "PseudoVFWMUL_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmacc", "PseudoVFMACC_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmacc", "PseudoVFNMACC_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmsac", "PseudoVFMSAC_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmsac", "PseudoVFNMSAC_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmadd", "PseudoVFMADD_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmadd", "PseudoVFNMADD_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmsub", "PseudoVFMSUB_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmsub", "PseudoVFNMSUB_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmacc", "PseudoVFWMACC_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwnmacc", "PseudoVFWNMACC_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmsac", "PseudoVFWMSAC_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwnmsac", "PseudoVFWNMSAC_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatUnaryV_V<"int_riscv_vfrsqrt7", "PseudoVFRSQRT7_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatUnaryV_V_RM<"int_riscv_vfrec7", "PseudoVFREC7_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfmin", "PseudoVFMIN_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfmax", "PseudoVFMAX_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnj", "PseudoVFSGNJ_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjn", "PseudoVFSGNJN_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjx", "PseudoVFSGNJX_ALT",
+ AllBF16Vectors, isSEWAware=1>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmfeq", "PseudoVMFEQ_ALT", AllBF16Vectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmfle", "PseudoVMFLE_ALT", AllBF16Vectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmflt", "PseudoVMFLT_ALT", AllBF16Vectors>;
+defm : VPatBinaryM_VV_VX<"int_riscv_vmfne", "PseudoVMFNE_ALT", AllBF16Vectors>;
+defm : VPatBinaryM_VX<"int_riscv_vmfgt", "PseudoVMFGT_ALT", AllBF16Vectors>;
+defm : VPatBinaryM_VX<"int_riscv_vmfge", "PseudoVMFGE_ALT", AllBF16Vectors>;
+defm : VPatBinarySwappedM_VV<"int_riscv_vmfgt", "PseudoVMFLT_ALT", AllBF16Vectors>;
+defm : VPatBinarySwappedM_VV<"int_riscv_vmfge", "PseudoVMFLE_ALT", AllBF16Vectors>;
+defm : VPatConversionVI_VF_BF16<"int_riscv_vfclass", "PseudoVFCLASS_ALT">;
+foreach vti = AllBF16Vectors in {
+ let Predicates = GetVTypePredicates<vti>.Predicates in
+ defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE_ALT",
+ "V"#vti.ScalarSuffix#"M",
+ vti.Vector,
+ vti.Vector, vti.Scalar, vti.Mask,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
+ vti.RegClass, vti.ScalarRegClass>;
+}
+defm : VPatConversionWF_VI_BF16<"int_riscv_vfwcvt_f_xu_v", "PseudoVFWCVT_F_XU_ALT",
+ isSEWAware=1>;
+defm : VPatConversionWF_VI_BF16<"int_riscv_vfwcvt_f_x_v", "PseudoVFWCVT_F_X_ALT",
+ isSEWAware=1>;
+defm : VPatConversionWF_VF_BF16<"int_riscv_vfwcvt_f_f_v", "PseudoVFWCVT_F_F_ALT",
+ isSEWAware=1>;
+defm : VPatConversionVI_WF_RM_BF16<"int_riscv_vfncvt_xu_f_w", "PseudoVFNCVT_XU_F_ALT">;
+defm : VPatConversionVI_WF_RM_BF16<"int_riscv_vfncvt_x_f_w", "PseudoVFNCVT_X_F_ALT">;
+defm : VPatConversionVI_WF_BF16<"int_riscv_vfncvt_rtz_xu_f_w", "PseudoVFNCVT_RTZ_XU_F_ALT">;
+defm : VPatConversionVI_WF_BF16<"int_riscv_vfncvt_rtz_x_f_w", "PseudoVFNCVT_RTZ_X_F_ALT">;
+defm : VPatConversionVF_WF_RM<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F_ALT",
+ AllWidenableBF16ToFloatVectors, isSEWAware=1>;
+defm : VPatConversionVF_WF_BF16<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F_ALT",
+ isSEWAware=1>;
+defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP_ALT", AllBF16Vectors>;
+defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN_ALT", AllBF16Vectors>;
+
+foreach fvti = AllBF16Vectors in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ let Predicates = GetVTypePredicates<ivti>.Predicates in {
+ // 13.16. Vector Floating-Point Move Instruction
+ // If we're splatting fpimm0, use vmv.v.i vd, 0.
+ def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
+ fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)),
+ (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
+ $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>;
+ def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
+ fvti.Vector:$passthru, (fvti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), VLOpFrag)),
+ (!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX)
+ $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>;
+ }
+
+ let Predicates = GetVTypePredicates<fvti>.Predicates in {
+ def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
+ fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMV_V_ALT_" # fvti.ScalarSuffix # "_" #
+ fvti.LMul.MX)
+ $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2),
+ GPR:$vl, fvti.Log2SEW, TU_MU)>;
+ }
+}
+
+foreach vti = NoGroupBF16Vectors in {
+ let Predicates = GetVTypePredicates<vti>.Predicates in {
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
+ (vti.Scalar (fpimm0)),
+ VLOpFrag)),
+ (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
+ (vti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))),
+ VLOpFrag)),
+ (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
+ vti.ScalarRegClass:$rs1,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_ALT")
+ vti.RegClass:$passthru,
+ (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>;
+ }
+
+ defvar vfmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
+ vti.ScalarSuffix,
+ "_S_ALT"));
+ // Only pattern-match extract-element operations where the index is 0. Any
+ // other index will have been custom-lowered to slide the vector correctly
+ // into place.
+ let Predicates = GetVTypePredicates<vti>.Predicates in
+ def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)),
+ (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
+}
+} // Predicates = [HasStdExtZvfbfa]
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 6acf799..334db4b 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -288,9 +288,12 @@ public:
bool hasVInstructionsI64() const { return HasStdExtZve64x; }
bool hasVInstructionsF16Minimal() const { return HasStdExtZvfhmin; }
bool hasVInstructionsF16() const { return HasStdExtZvfh; }
- bool hasVInstructionsBF16Minimal() const { return HasStdExtZvfbfmin; }
+ bool hasVInstructionsBF16Minimal() const {
+ return HasStdExtZvfbfmin || HasStdExtZvfbfa;
+ }
bool hasVInstructionsF32() const { return HasStdExtZve32f; }
bool hasVInstructionsF64() const { return HasStdExtZve64d; }
+ bool hasVInstructionsBF16() const { return HasStdExtZvfbfa; }
// F16 and F64 both require F32.
bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); }
bool hasVInstructionsFullMultiply() const { return HasStdExtV; }
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 56a6168..640b014 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -78,6 +78,8 @@ public:
void outputExecutionModeFromNumthreadsAttribute(
const MCRegister &Reg, const Attribute &Attr,
SPIRV::ExecutionMode::ExecutionMode EM);
+ void outputExecutionModeFromEnableMaximalReconvergenceAttr(
+ const MCRegister &Reg, const SPIRVSubtarget &ST);
void outputExecutionMode(const Module &M);
void outputAnnotations(const Module &M);
void outputModuleSections();
@@ -495,6 +497,20 @@ void SPIRVAsmPrinter::outputExecutionModeFromNumthreadsAttribute(
outputMCInst(Inst);
}
+void SPIRVAsmPrinter::outputExecutionModeFromEnableMaximalReconvergenceAttr(
+ const MCRegister &Reg, const SPIRVSubtarget &ST) {
+ assert(ST.canUseExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence) &&
+ "Function called when SPV_KHR_maximal_reconvergence is not enabled.");
+
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ unsigned EM =
+ static_cast<unsigned>(SPIRV::ExecutionMode::MaximallyReconvergesKHR);
+ Inst.addOperand(MCOperand::createImm(EM));
+ outputMCInst(Inst);
+}
+
void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode");
if (Node) {
@@ -551,6 +567,10 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
if (Attribute Attr = F.getFnAttribute("hlsl.numthreads"); Attr.isValid())
outputExecutionModeFromNumthreadsAttribute(
FReg, Attr, SPIRV::ExecutionMode::LocalSize);
+ if (Attribute Attr = F.getFnAttribute("enable-maximal-reconvergence");
+ Attr.getValueAsBool()) {
+ outputExecutionModeFromEnableMaximalReconvergenceAttr(FReg, *ST);
+ }
if (MDNode *Node = F.getMetadata("work_group_size_hint"))
outputExecutionModeFromMDNode(FReg, Node,
SPIRV::ExecutionMode::LocalSizeHint, 3, 1);
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 5f3ed86..96f5dee 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -153,7 +153,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
SPIRV::Extension::Extension::
SPV_EXT_relaxed_printf_string_address_space},
{"SPV_INTEL_predicated_io",
- SPIRV::Extension::Extension::SPV_INTEL_predicated_io}};
+ SPIRV::Extension::Extension::SPV_INTEL_predicated_io},
+ {"SPV_KHR_maximal_reconvergence",
+ SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index c6c6182..a151fd2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -1392,19 +1392,19 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
Constant *AggrConst = nullptr;
Type *ResTy = nullptr;
if (auto *COp = dyn_cast<ConstantVector>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = COp->getType();
} else if (auto *COp = dyn_cast<ConstantArray>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantStruct>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantDataArray>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantAggregateZero>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = Op->getType()->isVectorTy() ? COp->getType() : B.getInt32Ty();
}
if (AggrConst) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 5144fb1..61a0bbe 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1200,6 +1200,23 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
return;
}
+ bool IsNonUniform =
+ hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI);
+
+ auto FirstIndexReg = Instr.getOperand(3).getReg();
+ bool FirstIndexIsConstant =
+ Subtarget.getInstrInfo()->isConstantInstr(*MRI.getVRegDef(FirstIndexReg));
+
+ if (StorageClass == SPIRV::StorageClass::StorageClass::StorageBuffer) {
+ if (IsNonUniform)
+ Handler.addRequirements(
+ SPIRV::Capability::StorageBufferArrayNonUniformIndexingEXT);
+ else if (!FirstIndexIsConstant)
+ Handler.addRequirements(
+ SPIRV::Capability::StorageBufferArrayDynamicIndexing);
+ return;
+ }
+
Register PointeeTypeReg = ResTypeInst->getOperand(2).getReg();
MachineInstr *PointeeType = MRI.getUniqueVRegDef(PointeeTypeReg);
if (PointeeType->getOpcode() != SPIRV::OpTypeImage &&
@@ -1208,27 +1225,25 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
return;
}
- bool IsNonUniform =
- hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI);
if (isUniformTexelBuffer(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::UniformTexelBufferArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::UniformTexelBufferArrayDynamicIndexingEXT);
} else if (isInputAttachment(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::InputAttachmentArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::InputAttachmentArrayDynamicIndexingEXT);
} else if (isStorageTexelBuffer(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::StorageTexelBufferArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::StorageTexelBufferArrayDynamicIndexingEXT);
} else if (isSampledImage(PointeeType) ||
@@ -1237,14 +1252,14 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::SampledImageArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::SampledImageArrayDynamicIndexing);
} else if (isStorageImage(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::StorageImageArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::StorageImageArrayDynamicIndexing);
}
@@ -2155,6 +2170,9 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI,
SPIRV::OperandCategory::ExecutionModeOperand,
SPIRV::ExecutionMode::LocalSize, ST);
}
+ if (F.getFnAttribute("enable-maximal-reconvergence").getValueAsBool()) {
+ MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence);
+ }
if (F.getMetadata("work_group_size_hint"))
MAI.Reqs.getAndAddRequirements(
SPIRV::OperandCategory::ExecutionModeOperand,
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 2625642..7d08b29 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -386,6 +386,7 @@ defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>;
defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>;
+defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -698,7 +699,7 @@ defm IntersectionNV: ExecutionModelOperand<5314, [RayTracingNV]>;
defm AnyHitNV: ExecutionModelOperand<5315, [RayTracingNV]>;
defm ClosestHitNV: ExecutionModelOperand<5316, [RayTracingNV]>;
defm MissNV: ExecutionModelOperand<5317, [RayTracingNV]>;
-defm CallableNV: ExecutionModelOperand<5318, [RayTracingNV]>;
+defm CallableNV : ExecutionModelOperand<5318, [RayTracingNV]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define MemoryModel enum values and at the same time
@@ -805,6 +806,7 @@ defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>;
defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>;
defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>;
defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>;
+defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define StorageClass enum values and at the same time
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a0b64ff..b05d7c7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29755,65 +29755,30 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
- unsigned NumElts = VT.getVectorNumElements();
-
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
- // and use pmullw to calculate the full 16-bit product.
+ // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
+ // words and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
-
- MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi;
+ SDValue ALo, AHi, BLo, BHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- } else {
- ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
- }
-
- SDValue BLo, BHi;
- if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- // If the RHS is a constant, manually unpackl/unpackh and extend.
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (unsigned i = 0; i != NumElts; i += 16) {
- for (unsigned j = 0; j != 8; ++j) {
- SDValue LoOp = B.getOperand(i + j);
- SDValue HiOp = B.getOperand(i + j + 8);
-
- if (IsSigned) {
- LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
- LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
- DAG.getConstant(8, dl, MVT::i16));
- HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
- DAG.getConstant(8, dl, MVT::i16));
- } else {
- LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
- }
-
- LoOps.push_back(LoOp);
- HiOps.push_back(HiOp);
- }
- }
-
- BLo = DAG.getBuildVector(ExVT, dl, LoOps);
- BHi = DAG.getBuildVector(ExVT, dl, HiOps);
- } else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -29826,7 +29791,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -44848,10 +44813,16 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
- // iff we only need the sign bit then we can use R directly.
- if (OriginalDemandedBits.isSignMask() &&
- ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
- return TLO.CombineTo(Op, Op.getOperand(1));
+ if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
+ // iff we only need the signbit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask())
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // otherwise we just need R's signbit for the comparison.
+ APInt SignMask = APInt::getSignMask(BitWidth);
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
+ Known, TLO, Depth + 1))
+ return true;
+ }
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
@@ -47761,6 +47732,15 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
+ // If the sign bit is known then BLENDV can be folded away.
+ if (N->getOpcode() == X86ISD::BLENDV) {
+ KnownBits KnownCond = DAG.computeKnownBits(Cond);
+ if (KnownCond.isNegative())
+ return LHS;
+ if (KnownCond.isNonNegative())
+ return RHS;
+ }
+
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 6dd43b2..37d7772 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -606,16 +606,24 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
- if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
- Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+
+ RTLIB::LibcallImpl SecurityCookieVar =
+ getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
+ SecurityCookieVar != RTLIB::Unsupported) {
+ // MSVC CRT provides functionalities for stack protection.
// MSVC CRT has a global variable holding security cookie.
- M.getOrInsertGlobal("__security_cookie",
+ M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
PointerType::getUnqual(M.getContext()));
// MSVC CRT has a function to validate security cookie.
- FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- "__security_check_cookie", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(M.getContext()));
+ FunctionCallee SecurityCheckCookie =
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(M.getContext()));
+
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::X86_FastCall);
F->addParamAttr(0, Attribute::AttrKind::InReg);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 1d2cd39..5c23f91 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10809,39 +10809,27 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
if (!ST.hasSSE1())
return;
- // PXOR is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
} else if (X86::VR256RegClass.contains(Reg)) {
// YMM#
if (!ST.hasAVX())
return;
- // VPXOR is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
} else if (X86::VR512RegClass.contains(Reg)) {
// ZMM#
if (!ST.hasAVX512())
return;
- // VPXORY is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
} else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg)) {
if (!ST.hasVLX())
return;
- // KXOR is safe to use because it doesn't affect flags.
- unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
- BuildMI(MBB, Iter, DL, get(Op), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
+ BuildMI(MBB, Iter, DL, get(Op), Reg);
}
}
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 1fca466f..713d504 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1928,6 +1928,17 @@ static void addConstantComments(const MachineInstr *MI,
#define INSTR_CASE(Prefix, Instr, Suffix, Postfix) \
case X86::Prefix##Instr##Suffix##rm##Postfix:
+#define CASE_AVX512_ARITH_RM(Instr) \
+ INSTR_CASE(V, Instr, Z128, ) \
+ INSTR_CASE(V, Instr, Z128, k) \
+ INSTR_CASE(V, Instr, Z128, kz) \
+ INSTR_CASE(V, Instr, Z256, ) \
+ INSTR_CASE(V, Instr, Z256, k) \
+ INSTR_CASE(V, Instr, Z256, kz) \
+ INSTR_CASE(V, Instr, Z, ) \
+ INSTR_CASE(V, Instr, Z, k) \
+ INSTR_CASE(V, Instr, Z, kz)
+
#define CASE_ARITH_RM(Instr) \
INSTR_CASE(, Instr, , ) /* SSE */ \
INSTR_CASE(V, Instr, , ) /* AVX-128 */ \
@@ -1943,22 +1954,12 @@ static void addConstantComments(const MachineInstr *MI,
INSTR_CASE(V, Instr, Z, kz)
// TODO: Add additional instructions when useful.
- CASE_ARITH_RM(PMADDUBSW) {
- unsigned SrcIdx = getSrcIdx(MI, 1);
- if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
- std::string Comment;
- raw_string_ostream CS(Comment);
- unsigned VectorWidth =
- X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
- CS << "[";
- printConstant(C, VectorWidth, CS);
- CS << "]";
- OutStreamer.AddComment(CS.str());
- }
- break;
- }
-
+ CASE_ARITH_RM(PMADDUBSW)
CASE_ARITH_RM(PMADDWD)
+ CASE_ARITH_RM(PMULDQ)
+ CASE_ARITH_RM(PMULUDQ)
+ CASE_ARITH_RM(PMULLD)
+ CASE_AVX512_ARITH_RM(PMULLQ)
CASE_ARITH_RM(PMULLW)
CASE_ARITH_RM(PMULHW)
CASE_ARITH_RM(PMULHUW)
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 9268df2..31126cc 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -887,7 +887,7 @@ void RISCVISAInfo::updateImplication() {
}
static constexpr StringLiteral CombineIntoExts[] = {
- {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"},
+ {"a"}, {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"},
{"zvknc"}, {"zvkng"}, {"zvks"}, {"zvksc"}, {"zvksg"},
};
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index b73a0ce..4645670 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -147,7 +147,7 @@ public:
private:
// Only add checks if the module has the cfguard=2 flag.
- int cfguard_module_flag = 0;
+ int CFGuardModuleFlag = 0;
StringRef GuardFnName;
Mechanism GuardMechanism = Mechanism::Check;
FunctionType *GuardFnType = nullptr;
@@ -162,9 +162,7 @@ public:
static char ID;
// Default constructor required for the INITIALIZE_PASS macro.
- CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) {
- initializeCFGuardPass(*PassRegistry::getPassRegistry());
- }
+ CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) {}
bool doInitialization(Module &M) override { return Impl.doInitialization(M); }
bool runOnFunction(Function &F) override { return Impl.runOnFunction(F); }
@@ -173,7 +171,6 @@ public:
} // end anonymous namespace
void CFGuardImpl::insertCFGuardCheck(CallBase *CB) {
-
assert(CB->getModule()->getTargetTriple().isOSWindows() &&
"Only applicable for Windows targets");
assert(CB->isIndirectCall() &&
@@ -202,7 +199,6 @@ void CFGuardImpl::insertCFGuardCheck(CallBase *CB) {
}
void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) {
-
assert(CB->getModule()->getTargetTriple().isOSWindows() &&
"Only applicable for Windows targets");
assert(CB->isIndirectCall() &&
@@ -236,14 +232,13 @@ void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) {
}
bool CFGuardImpl::doInitialization(Module &M) {
-
// Check if this module has the cfguard flag and read its value.
if (auto *MD =
mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
- cfguard_module_flag = MD->getZExtValue();
+ CFGuardModuleFlag = MD->getZExtValue();
// Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
+ if (CFGuardModuleFlag != 2)
return false;
// Set up prototypes for the guard check and dispatch functions.
@@ -264,9 +259,8 @@ bool CFGuardImpl::doInitialization(Module &M) {
}
bool CFGuardImpl::runOnFunction(Function &F) {
-
// Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
+ if (CFGuardModuleFlag != 2)
return false;
SmallVector<CallBase *, 8> IndirectCalls;
@@ -286,19 +280,16 @@ bool CFGuardImpl::runOnFunction(Function &F) {
}
// If no checks are needed, return early.
- if (IndirectCalls.empty()) {
+ if (IndirectCalls.empty())
return false;
- }
// For each indirect call/invoke, add the appropriate dispatch or check.
if (GuardMechanism == Mechanism::Dispatch) {
- for (CallBase *CB : IndirectCalls) {
+ for (CallBase *CB : IndirectCalls)
insertCFGuardDispatch(CB);
- }
} else {
- for (CallBase *CB : IndirectCalls) {
+ for (CallBase *CB : IndirectCalls)
insertCFGuardCheck(CB);
- }
}
return true;
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5066a99..894d83f 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -6150,3 +6150,42 @@ void MemProfContextDisambiguation::run(
IndexCallsiteContextGraph CCG(Index, isPrevailing);
CCG.process();
}
+
+// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
+// when we don't have an index that has recorded that we are linking with
+// allocation libraries containing the necessary APIs for downstream
+// transformations.
+PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) {
+ // The profile matcher applies hotness attributes directly for allocations,
+ // and those will cause us to generate calls to the hot/cold interfaces
+ // unconditionally. If supports-hot-cold-new was not enabled in the LTO
+ // link then assume we don't want these calls (e.g. not linking with
+ // the appropriate library, or otherwise trying to disable this behavior).
+ bool Changed = false;
+ for (auto &F : M) {
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *CI = dyn_cast<CallBase>(&I);
+ if (!CI)
+ continue;
+ if (CI->hasFnAttr("memprof")) {
+ CI->removeFnAttr("memprof");
+ Changed = true;
+ }
+ if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
+ assert(!CI->hasMetadata(LLVMContext::MD_memprof));
+ continue;
+ }
+ // Strip off all memprof metadata as it is no longer needed.
+ // Importantly, this avoids the addition of new memprof attributes
+ // after inlining propagation.
+ CI->setMetadata(LLVMContext::MD_memprof, nullptr);
+ CI->setMetadata(LLVMContext::MD_callsite, nullptr);
+ Changed = true;
+ }
+ }
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 07ad65c..fba1ccf 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1481,13 +1481,13 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
return new ICmpInst(Pred, Y, ConstantInt::get(SrcTy, C.logBase2()));
}
- if (Cmp.isEquality() && Trunc->hasOneUse()) {
+ if (Cmp.isEquality() && (Trunc->hasOneUse() || Trunc->hasNoUnsignedWrap())) {
// Canonicalize to a mask and wider compare if the wide type is suitable:
// (trunc X to i8) == C --> (X & 0xff) == (zext C)
if (!SrcTy->isVectorTy() && shouldChangeType(DstBits, SrcBits)) {
Constant *Mask =
ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcBits, DstBits));
- Value *And = Builder.CreateAnd(X, Mask);
+ Value *And = Trunc->hasNoUnsignedWrap() ? X : Builder.CreateAnd(X, Mask);
Constant *WideC = ConstantInt::get(SrcTy, C.zext(SrcBits));
return new ICmpInst(Pred, And, WideC);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 09cb225..a8eb9b9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3757,6 +3757,10 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder,
// (x < y) ? -1 : zext(x > y)
// (x > y) ? 1 : sext(x != y)
// (x > y) ? 1 : sext(x < y)
+// (x == y) ? 0 : (x > y ? 1 : -1)
+// (x == y) ? 0 : (x < y ? -1 : 1)
+// Special case: x == C ? 0 : (x > C - 1 ? 1 : -1)
+// Special case: x == C ? 0 : (x < C + 1 ? -1 : 1)
// Into ucmp/scmp(x, y), where signedness is determined by the signedness
// of the comparison in the original sequence.
Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
@@ -3849,6 +3853,44 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
}
}
+ // Special cases with constants: x == C ? 0 : (x > C-1 ? 1 : -1)
+ if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero())) {
+ const APInt *C;
+ if (match(RHS, m_APInt(C))) {
+ CmpPredicate InnerPred;
+ Value *InnerRHS;
+ const APInt *InnerTV, *InnerFV;
+ if (match(FV,
+ m_Select(m_ICmp(InnerPred, m_Specific(LHS), m_Value(InnerRHS)),
+ m_APInt(InnerTV), m_APInt(InnerFV)))) {
+
+ // x == C ? 0 : (x > C-1 ? 1 : -1)
+ if (ICmpInst::isGT(InnerPred) && InnerTV->isOne() &&
+ InnerFV->isAllOnes()) {
+ IsSigned = ICmpInst::isSigned(InnerPred);
+ bool CanSubOne = IsSigned ? !C->isMinSignedValue() : !C->isMinValue();
+ if (CanSubOne) {
+ APInt Cminus1 = *C - 1;
+ if (match(InnerRHS, m_SpecificInt(Cminus1)))
+ Replace = true;
+ }
+ }
+
+ // x == C ? 0 : (x < C+1 ? -1 : 1)
+ if (ICmpInst::isLT(InnerPred) && InnerTV->isAllOnes() &&
+ InnerFV->isOne()) {
+ IsSigned = ICmpInst::isSigned(InnerPred);
+ bool CanAddOne = IsSigned ? !C->isMaxSignedValue() : !C->isMaxValue();
+ if (CanAddOne) {
+ APInt Cplus1 = *C + 1;
+ if (match(InnerRHS, m_SpecificInt(Cplus1)))
+ Replace = true;
+ }
+ }
+ }
+ }
+ }
+
Intrinsic::ID IID = IsSigned ? Intrinsic::scmp : Intrinsic::ucmp;
if (Replace)
return replaceInstUsesWith(
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 6e17801..2646334 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -844,6 +844,7 @@ struct AddressSanitizer {
bool maybeInsertAsanInitAtFunctionEntry(Function &F);
bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
void markEscapedLocalAllocas(Function &F);
+ void markCatchParametersAsUninteresting(Function &F);
private:
friend struct FunctionStackPoisoner;
@@ -2997,6 +2998,22 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
}
}
}
+// Mitigation for https://github.com/google/sanitizers/issues/749
+// We don't instrument Windows catch-block parameters to avoid
+// interfering with exception handling assumptions.
+void AddressSanitizer::markCatchParametersAsUninteresting(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (auto *CatchPad = dyn_cast<CatchPadInst>(&I)) {
+ // Mark the parameters to a catch-block as uninteresting to avoid
+ // instrumenting them.
+ for (Value *Operand : CatchPad->arg_operands())
+ if (auto *AI = dyn_cast<AllocaInst>(Operand))
+ ProcessedAllocas[AI] = false;
+ }
+ }
+ }
+}
bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
bool ShouldInstrument =
@@ -3041,6 +3058,9 @@ bool AddressSanitizer::instrumentFunction(Function &F,
// can be passed to that intrinsic.
markEscapedLocalAllocas(F);
+ if (TargetTriple.isOSWindows())
+ markCatchParametersAsUninteresting(F);
+
// We want to instrument every address only once per basic block (unless there
// are calls between uses).
SmallPtrSet<Value *, 16> TempsToInstrument;
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 20733032..19eccb9 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -368,7 +368,7 @@ private:
Valid = false;
}
- bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+ bool reportInvalidCandidate(Statistic &Stat) const {
using namespace ore;
assert(L && Preheader && "Fusion candidate not initialized properly!");
#if LLVM_ENABLE_STATS
@@ -445,6 +445,7 @@ struct FusionCandidateCompare {
"No dominance relationship between these fusion candidates!");
}
};
+} // namespace
using LoopVector = SmallVector<Loop *, 4>;
@@ -461,9 +462,15 @@ using LoopVector = SmallVector<Loop *, 4>;
using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
-#if !defined(NDEBUG)
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidate &FC) {
+#ifndef NDEBUG
+static void printLoopVector(const LoopVector &LV) {
+ dbgs() << "****************************\n";
+ for (const Loop *L : LV)
+ printLoop(*L, dbgs());
+ dbgs() << "****************************\n";
+}
+
+static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) {
if (FC.isValid())
OS << FC.Preheader->getName();
else
@@ -472,8 +479,8 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
return OS;
}
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidateSet &CandSet) {
+static raw_ostream &operator<<(raw_ostream &OS,
+ const FusionCandidateSet &CandSet) {
for (const FusionCandidate &FC : CandSet)
OS << FC << '\n';
@@ -489,7 +496,9 @@ printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
dbgs() << "****************************\n";
}
}
-#endif
+#endif // NDEBUG
+
+namespace {
/// Collect all loops in function at the same nest level, starting at the
/// outermost level.
@@ -550,15 +559,6 @@ private:
LoopsOnLevelTy LoopsOnLevel;
};
-#ifndef NDEBUG
-static void printLoopVector(const LoopVector &LV) {
- dbgs() << "****************************\n";
- for (auto *L : LV)
- printLoop(*L, dbgs());
- dbgs() << "****************************\n";
-}
-#endif
-
struct LoopFuser {
private:
// Sets of control flow equivalent fusion candidates for a given nest level.
@@ -1850,7 +1850,7 @@ private:
/// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
template <typename RemarkKind>
void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
- llvm::Statistic &Stat) {
+ Statistic &Stat) {
assert(FC0.Preheader && FC1.Preheader &&
"Expecting valid fusion candidates");
using namespace ore;
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 7da8586..d827e64 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -8,7 +8,6 @@
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -217,9 +216,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
// Get the analysis results needed by loop passes.
MemorySSA *MSSA =
UseMemorySSA ? (&AM.getResult<MemorySSAAnalysis>(F).getMSSA()) : nullptr;
- BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData()
- ? (&AM.getResult<BlockFrequencyAnalysis>(F))
- : nullptr;
LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F),
AM.getResult<AssumptionAnalysis>(F),
AM.getResult<DominatorTreeAnalysis>(F),
@@ -227,7 +223,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
AM.getResult<ScalarEvolutionAnalysis>(F),
AM.getResult<TargetLibraryAnalysis>(F),
AM.getResult<TargetIRAnalysis>(F),
- BFI,
MSSA};
// Setup the loop analysis manager from its proxy. It is important that
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 7cae94eb..3487e81 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -97,6 +97,12 @@ static cl::opt<MatrixLayoutTy> MatrixLayout(
static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",
cl::init(false));
+static cl::opt<unsigned> SplitMatmulRemainderOverThreshold(
+ "matrix-split-matmul-remainder-over-threshold", cl::Hidden,
+ cl::desc("Illegal remainder vectors over this size in bits should be split "
+ "in the inner loop of matmul"),
+ cl::init(0));
+
/// Helper function to either return Scope, if it is a subprogram or the
/// attached subprogram for a local scope.
static DISubprogram *getSubprogram(DIScope *Scope) {
@@ -115,18 +121,16 @@ static bool isSplat(Value *V) {
/// Match any mul operation (fp or integer).
template <typename LTy, typename RTy>
-auto m_AnyMul(const LTy &L, const RTy &R) {
+static auto m_AnyMul(const LTy &L, const RTy &R) {
return m_CombineOr(m_Mul(L, R), m_FMul(L, R));
}
/// Match any add operation (fp or integer).
template <typename LTy, typename RTy>
-auto m_AnyAdd(const LTy &L, const RTy &R) {
+static auto m_AnyAdd(const LTy &L, const RTy &R) {
return m_CombineOr(m_Add(L, R), m_FAdd(L, R));
}
-namespace {
-
// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
// assuming \p Stride elements between start two consecutive vectors.
@@ -167,9 +171,9 @@ namespace {
// v_2_0 |v_2_1 |v_2_2 |v_2_3
// v_3_0 {v_3_1 {v_3_2 v_3_3
//
-Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
- unsigned NumElements, Type *EltType,
- IRBuilder<> &Builder) {
+static Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
+ unsigned NumElements, Type *EltType,
+ IRBuilder<> &Builder) {
assert((!isa<ConstantInt>(Stride) ||
cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
@@ -338,6 +342,8 @@ computeShapeInfoForInst(Instruction *I,
return std::nullopt;
}
+namespace {
+
/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
///
/// Currently, the lowering for each matrix intrinsic is done as follows:
@@ -371,7 +377,8 @@ class LowerMatrixIntrinsics {
LoopInfo *LI = nullptr;
OptimizationRemarkEmitter *ORE = nullptr;
- /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
+ /// Contains estimates of the number of operations (loads, stores, compute)
+ /// required to lower a matrix operation.
struct OpInfoTy {
/// Number of stores emitted to generate this matrix.
unsigned NumStores = 0;
@@ -1719,6 +1726,31 @@ public:
ToRemove.push_back(MatMul);
}
+ /// Given \p Remainder iterations of the matmul inner loop,
+ /// potentially lower \p BlockSize that is used for the underlying
+ /// vector.
+ unsigned capBlockSize(unsigned BlockSize, unsigned Remainder, Type *EltType) {
+ if (BlockSize <= Remainder)
+ return BlockSize;
+
+ // If the remainder is also a legal type just use it.
+ auto *VecTy = FixedVectorType::get(EltType, Remainder);
+ if (TTI.isTypeLegal(VecTy))
+ return Remainder;
+
+ // Similarly, use the remainder as-is if the vector is small enough
+ // that we don't want to split further.
+ if (VecTy->getPrimitiveSizeInBits() <= SplitMatmulRemainderOverThreshold)
+ return Remainder;
+
+ // Gradually lower the vectorization factor to cover the
+ // remainder.
+ do {
+ BlockSize /= 2;
+ } while (BlockSize > Remainder);
+ return BlockSize;
+ }
+
/// Compute \p Result += \p A * \p B for input matrices with left-associating
/// addition.
///
@@ -1756,10 +1788,8 @@ public:
bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
for (unsigned I = 0; I < R; I += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (I + BlockSize > R)
- BlockSize /= 2;
-
+ // Lower block size to make sure we stay within bounds.
+ BlockSize = capBlockSize(BlockSize, R - I, Result.getElementType());
Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder)
: nullptr;
for (unsigned K = 0; K < M; ++K) {
@@ -1784,9 +1814,8 @@ public:
unsigned BlockSize = VF;
bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
for (unsigned J = 0; J < C; J += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (J + BlockSize > C)
- BlockSize /= 2;
+ // Lower the vectorization factor to cover the remainder.
+ BlockSize = capBlockSize(BlockSize, C - J, Result.getElementType());
Value *Sum = nullptr;
for (unsigned K = 0; K < M; ++K) {
diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 30b27cb..7646624 100644
--- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -107,9 +107,7 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
return PA;
}
-namespace llvm {
-
-void initializeRegToMemWrapperPassPass(PassRegistry &);
+namespace {
class RegToMemWrapperPass : public FunctionPass {
public:
@@ -136,7 +134,7 @@ public:
return N != 0 || Changed;
}
};
-} // namespace llvm
+} // namespace
INITIALIZE_PASS_BEGIN(RegToMemWrapperPass, "reg2mem", "", true, true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index a692009..5c60fad 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -344,6 +344,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
uint64_t SliceSizeInBits, Instruction *OldInst,
Instruction *Inst, Value *Dest, Value *Value,
const DataLayout &DL) {
+ // If we want allocas to be migrated using this helper then we need to ensure
+ // that the BaseFragments map code still works. A simple solution would be
+ // to choose to always clone alloca dbg_assigns (rather than sometimes
+ // "stealing" them).
+ assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
+
auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
// Nothing to do if OldInst has no linked dbg.assign intrinsics.
if (DVRAssignMarkerRange.empty())
@@ -429,11 +435,22 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
}
- ::Value *NewValue = Value ? Value : DbgAssign->getValue();
- DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
- DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
- Dest, DIExpression::get(Expr->getContext(), {}),
- DbgAssign->getDebugLoc())));
+ DbgVariableRecord *NewAssign;
+ if (IsSplit) {
+ ::Value *NewValue = Value ? Value : DbgAssign->getValue();
+ NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
+ DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
+ Dest, DIExpression::get(Expr->getContext(), {}),
+ DbgAssign->getDebugLoc())));
+ } else {
+ // The store is not split, simply steal the existing dbg_assign.
+ NewAssign = DbgAssign;
+ NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
+ NewAssign->setAddress(Dest);
+ if (Value)
+ NewAssign->replaceVariableLocationOp(0u, Value);
+ assert(Expr == NewAssign->getExpression());
+ }
// If we've updated the value but the original dbg.assign has an arglist
// then kill it now - we can't use the requested new value.
@@ -464,9 +481,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
// noted as slightly offset (in code) from the store. In practice this
// should have little effect on the debugging experience due to the fact
// that all the split stores should get the same line number.
- NewAssign->moveBefore(DbgAssign->getIterator());
-
- NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
+ if (NewAssign != DbgAssign) {
+ NewAssign->moveBefore(DbgAssign->getIterator());
+ NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
+ }
LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
};
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index e4ba70d..5af6c96 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -27,7 +27,6 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -3611,8 +3610,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
AssumptionCache &AC, AAResults &AA,
TargetTransformInfo &TTI, bool Trivial,
bool NonTrivial, ScalarEvolution *SE,
- MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) {
+ MemorySSAUpdater *MSSAU, LPMUpdater &LoopUpdater) {
assert(L.isRecursivelyLCSSAForm(DT, LI) &&
"Loops must be in LCSSA form before unswitching.");
@@ -3652,35 +3650,6 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
if (F->hasOptSize())
return false;
- // Returns true if Loop L's loop nest is cold, i.e. if the headers of L,
- // of the loops L is nested in, and of the loops nested in L are all cold.
- auto IsLoopNestCold = [&](const Loop *L) {
- // Check L and all of its parent loops.
- auto *Parent = L;
- while (Parent) {
- if (!PSI->isColdBlock(Parent->getHeader(), BFI))
- return false;
- Parent = Parent->getParentLoop();
- }
- // Next check all loops nested within L.
- SmallVector<const Loop *, 4> Worklist;
- llvm::append_range(Worklist, L->getSubLoops());
- while (!Worklist.empty()) {
- auto *CurLoop = Worklist.pop_back_val();
- if (!PSI->isColdBlock(CurLoop->getHeader(), BFI))
- return false;
- llvm::append_range(Worklist, CurLoop->getSubLoops());
- }
- return true;
- };
-
- // Skip cold loops in cold loop nests, as unswitching them brings little
- // benefit but increases the code size
- if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) {
- LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n");
- return false;
- }
-
// Perform legality checks.
if (!isSafeForNoNTrivialUnswitching(L, LI))
return false;
@@ -3705,11 +3674,6 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
LPMUpdater &U) {
Function &F = *L.getHeader()->getParent();
(void)F;
- ProfileSummaryInfo *PSI = nullptr;
- if (auto OuterProxy =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR)
- .getCachedResult<ModuleAnalysisManagerFunctionProxy>(F))
- PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
<< "\n");
@@ -3720,7 +3684,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
AR.MSSA->verifyMemorySSA();
}
if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial,
- &AR.SE, MSSAU ? &*MSSAU : nullptr, PSI, AR.BFI, U))
+ &AR.SE, MSSAU ? &*MSSAU : nullptr, U))
return PreservedAnalyses::all();
if (AR.MSSA && VerifyMemorySSA)
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 9693ae6..b80c3c9 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -634,18 +634,10 @@ private:
/// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV
/// changes.
bool mergeInValue(ValueLatticeElement &IV, Value *V,
- ValueLatticeElement MergeWithV,
+ const ValueLatticeElement &MergeWithV,
ValueLatticeElement::MergeOptions Opts = {
/*MayIncludeUndef=*/false, /*CheckWiden=*/false});
- bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
- ValueLatticeElement::MergeOptions Opts = {
- /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
- assert(!V->getType()->isStructTy() &&
- "non-structs should use markConstant");
- return mergeInValue(ValueState[V], V, MergeWithV, Opts);
- }
-
/// getValueState - Return the ValueLatticeElement object that corresponds to
/// the value. This function handles the case when the value hasn't been seen
/// yet by properly seeding constants etc.
@@ -987,7 +979,7 @@ public:
void trackValueOfArgument(Argument *A) {
if (A->getType()->isStructTy())
return (void)markOverdefined(A);
- mergeInValue(A, getArgAttributeVL(A));
+ mergeInValue(ValueState[A], A, getArgAttributeVL(A));
}
bool isStructLatticeConstant(Function *F, StructType *STy);
@@ -1128,8 +1120,7 @@ bool SCCPInstVisitor::isStructLatticeConstant(Function *F, StructType *STy) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
assert(It != TrackedMultipleRetVals.end());
- ValueLatticeElement LV = It->second;
- if (!SCCPSolver::isConstant(LV))
+ if (!SCCPSolver::isConstant(It->second))
return false;
}
return true;
@@ -1160,7 +1151,7 @@ Constant *SCCPInstVisitor::getConstantOrNull(Value *V) const {
std::vector<Constant *> ConstVals;
auto *ST = cast<StructType>(V->getType());
for (unsigned I = 0, E = ST->getNumElements(); I != E; ++I) {
- ValueLatticeElement LV = LVs[I];
+ const ValueLatticeElement &LV = LVs[I];
ConstVals.push_back(SCCPSolver::isConstant(LV)
? getConstant(LV, ST->getElementType(I))
: UndefValue::get(ST->getElementType(I)));
@@ -1225,7 +1216,7 @@ void SCCPInstVisitor::visitInstruction(Instruction &I) {
}
bool SCCPInstVisitor::mergeInValue(ValueLatticeElement &IV, Value *V,
- ValueLatticeElement MergeWithV,
+ const ValueLatticeElement &MergeWithV,
ValueLatticeElement::MergeOptions Opts) {
if (IV.mergeIn(MergeWithV, Opts)) {
pushUsersToWorkList(V);
@@ -1264,7 +1255,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI,
return;
}
- ValueLatticeElement BCValue = getValueState(BI->getCondition());
+ const ValueLatticeElement &BCValue = getValueState(BI->getCondition());
ConstantInt *CI = getConstantInt(BCValue, BI->getCondition()->getType());
if (!CI) {
// Overdefined condition variables, and branches on unfoldable constant
@@ -1326,7 +1317,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI,
// the target as executable.
if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
// Casts are folded by visitCastInst.
- ValueLatticeElement IBRValue = getValueState(IBR->getAddress());
+ const ValueLatticeElement &IBRValue = getValueState(IBR->getAddress());
BlockAddress *Addr = dyn_cast_or_null<BlockAddress>(
getConstant(IBRValue, IBR->getAddress()->getType()));
if (!Addr) { // Overdefined or unknown condition?
@@ -1408,7 +1399,7 @@ void SCCPInstVisitor::visitPHINode(PHINode &PN) {
if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
continue;
- ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
+ const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
PhiState.mergeIn(IV);
NumActiveIncoming++;
if (PhiState.isOverdefined())
@@ -1420,10 +1411,10 @@ void SCCPInstVisitor::visitPHINode(PHINode &PN) {
// extensions to match the number of active incoming values. This helps to
// limit multiple extensions caused by the same incoming value, if other
// incoming values are equal.
- mergeInValue(&PN, PhiState,
+ ValueLatticeElement &PhiStateRef = ValueState[&PN];
+ mergeInValue(PhiStateRef, &PN, PhiState,
ValueLatticeElement::MergeOptions().setMaxWidenSteps(
NumActiveIncoming + 1));
- ValueLatticeElement &PhiStateRef = getValueState(&PN);
PhiStateRef.setNumRangeExtensions(
std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
}
@@ -1481,7 +1472,7 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
}
}
- ValueLatticeElement OpSt = getValueState(I.getOperand(0));
+ const ValueLatticeElement &OpSt = getValueState(I.getOperand(0));
if (OpSt.isUnknownOrUndef())
return;
@@ -1496,9 +1487,9 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
if (I.getDestTy()->isIntOrIntVectorTy() &&
I.getSrcTy()->isIntOrIntVectorTy() &&
I.getOpcode() != Instruction::BitCast) {
- auto &LV = getValueState(&I);
ConstantRange OpRange =
OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false);
+ auto &LV = getValueState(&I);
Type *DestTy = I.getDestTy();
ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits());
@@ -1516,19 +1507,24 @@ void SCCPInstVisitor::handleExtractOfWithOverflow(ExtractValueInst &EVI,
const WithOverflowInst *WO,
unsigned Idx) {
Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
- ValueLatticeElement L = getValueState(LHS);
- ValueLatticeElement R = getValueState(RHS);
+ Type *Ty = LHS->getType();
+
addAdditionalUser(LHS, &EVI);
addAdditionalUser(RHS, &EVI);
- if (L.isUnknownOrUndef() || R.isUnknownOrUndef())
- return; // Wait to resolve.
- Type *Ty = LHS->getType();
+ const ValueLatticeElement &L = getValueState(LHS);
+ if (L.isUnknownOrUndef())
+ return; // Wait to resolve.
ConstantRange LR = L.asConstantRange(Ty, /*UndefAllowed=*/false);
+
+ const ValueLatticeElement &R = getValueState(RHS);
+ if (R.isUnknownOrUndef())
+ return; // Wait to resolve.
+
ConstantRange RR = R.asConstantRange(Ty, /*UndefAllowed=*/false);
if (Idx == 0) {
ConstantRange Res = LR.binaryOp(WO->getBinaryOp(), RR);
- mergeInValue(&EVI, ValueLatticeElement::getRange(Res));
+ mergeInValue(ValueState[&EVI], &EVI, ValueLatticeElement::getRange(Res));
} else {
assert(Idx == 1 && "Index can only be 0 or 1");
ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
@@ -1560,7 +1556,7 @@ void SCCPInstVisitor::visitExtractValueInst(ExtractValueInst &EVI) {
if (auto *WO = dyn_cast<WithOverflowInst>(AggVal))
return handleExtractOfWithOverflow(EVI, WO, i);
ValueLatticeElement EltVal = getStructValueState(AggVal, i);
- mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ mergeInValue(ValueState[&EVI], &EVI, EltVal);
} else {
// Otherwise, must be extracting from an array.
return (void)markOverdefined(&EVI);
@@ -1616,14 +1612,18 @@ void SCCPInstVisitor::visitSelectInst(SelectInst &I) {
if (ValueState[&I].isOverdefined())
return (void)markOverdefined(&I);
- ValueLatticeElement CondValue = getValueState(I.getCondition());
+ const ValueLatticeElement &CondValue = getValueState(I.getCondition());
if (CondValue.isUnknownOrUndef())
return;
if (ConstantInt *CondCB =
getConstantInt(CondValue, I.getCondition()->getType())) {
Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
- mergeInValue(&I, getValueState(OpVal));
+ const ValueLatticeElement &OpValState = getValueState(OpVal);
+ // Safety: ValueState[&I] doesn't invalidate OpValState since it is already
+ // in the map.
+ assert(ValueState.contains(&I) && "&I is not in ValueState map.");
+ mergeInValue(ValueState[&I], &I, OpValState);
return;
}
@@ -1721,7 +1721,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
// being a special floating value.
ValueLatticeElement NewV;
NewV.markConstant(C, /*MayIncludeUndef=*/true);
- return (void)mergeInValue(&I, NewV);
+ return (void)mergeInValue(ValueState[&I], &I, NewV);
}
}
@@ -1741,7 +1741,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
R = A.overflowingBinaryOp(BO->getOpcode(), B, OBO->getNoWrapKind());
else
R = A.binaryOp(BO->getOpcode(), B);
- mergeInValue(&I, ValueLatticeElement::getRange(R));
+ mergeInValue(ValueState[&I], &I, ValueLatticeElement::getRange(R));
// TODO: Currently we do not exploit special values that produce something
// better than overdefined with an overdefined operand for vector or floating
@@ -1767,7 +1767,7 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) {
if (C) {
ValueLatticeElement CV;
CV.markConstant(C);
- mergeInValue(&I, CV);
+ mergeInValue(ValueState[&I], &I, CV);
return;
}
@@ -1802,7 +1802,7 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
Operands.reserve(I.getNumOperands());
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
- ValueLatticeElement State = getValueState(I.getOperand(i));
+ const ValueLatticeElement &State = getValueState(I.getOperand(i));
if (State.isUnknownOrUndef())
return; // Operands are not resolved yet.
@@ -1881,14 +1881,13 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) {
if (ValueState[&I].isOverdefined())
return (void)markOverdefined(&I);
- ValueLatticeElement PtrVal = getValueState(I.getOperand(0));
+ const ValueLatticeElement &PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknownOrUndef())
return; // The pointer is not resolved yet!
- ValueLatticeElement &IV = ValueState[&I];
-
if (SCCPSolver::isConstant(PtrVal)) {
Constant *Ptr = getConstant(PtrVal, I.getOperand(0)->getType());
+ ValueLatticeElement &IV = ValueState[&I];
// load null is undefined.
if (isa<ConstantPointerNull>(Ptr)) {
@@ -1916,7 +1915,7 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) {
}
// Fall back to metadata.
- mergeInValue(&I, getValueFromMetadata(&I));
+ mergeInValue(ValueState[&I], &I, getValueFromMetadata(&I));
}
void SCCPInstVisitor::visitCallBase(CallBase &CB) {
@@ -1944,7 +1943,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) {
return markOverdefined(&CB); // Can't handle struct args.
if (A.get()->getType()->isMetadataTy())
continue; // Carried in CB, not allowed in Operands.
- ValueLatticeElement State = getValueState(A);
+ const ValueLatticeElement &State = getValueState(A);
if (State.isUnknownOrUndef())
return; // Operands are not resolved yet.
@@ -1964,7 +1963,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) {
}
// Fall back to metadata.
- mergeInValue(&CB, getValueFromMetadata(&CB));
+ mergeInValue(ValueState[&CB], &CB, getValueFromMetadata(&CB));
}
void SCCPInstVisitor::handleCallArguments(CallBase &CB) {
@@ -1992,10 +1991,11 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) {
mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
getMaxWidenStepsOpts());
}
- } else
- mergeInValue(&*AI,
- getValueState(*CAI).intersect(getArgAttributeVL(&*AI)),
- getMaxWidenStepsOpts());
+ } else {
+ ValueLatticeElement CallArg =
+ getValueState(*CAI).intersect(getArgAttributeVL(&*AI));
+ mergeInValue(ValueState[&*AI], &*AI, CallArg, getMaxWidenStepsOpts());
+ }
}
}
}
@@ -2076,7 +2076,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
if (II->getIntrinsicID() == Intrinsic::vscale) {
unsigned BitWidth = CB.getType()->getScalarSizeInBits();
const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth);
- return (void)mergeInValue(II, ValueLatticeElement::getRange(Result));
+ return (void)mergeInValue(ValueState[II], II,
+ ValueLatticeElement::getRange(Result));
}
if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) {
@@ -2094,7 +2095,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
ConstantRange Result =
ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges);
- return (void)mergeInValue(II, ValueLatticeElement::getRange(Result));
+ return (void)mergeInValue(ValueState[II], II,
+ ValueLatticeElement::getRange(Result));
}
}
@@ -2121,7 +2123,7 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
return handleCallOverdefined(CB); // Not tracking this callee.
// If so, propagate the return value of the callee into this call result.
- mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
+ mergeInValue(ValueState[&CB], &CB, TFRVI->second, getMaxWidenStepsOpts());
}
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 88af2cf..9cd52da 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2242,8 +2242,49 @@ public:
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const;
+ Align Alignment, const int64_t Diff,
+ const size_t Sz) const;
+
+ /// Return true if an array of scalar loads can be replaced with a strided
+ /// load (with constant stride).
+ ///
+ /// TODO:
+ /// It is possible that the load gets "widened". Suppose that originally each
+ /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
+ /// constant):
+ /// %b + 0 * %s + 0
+ /// %b + 0 * %s + 1
+ /// %b + 0 * %s + 2
+ /// ...
+ /// %b + 0 * %s + (w - 1)
+ ///
+ /// %b + 1 * %s + 0
+ /// %b + 1 * %s + 1
+ /// %b + 1 * %s + 2
+ /// ...
+ /// %b + 1 * %s + (w - 1)
+ /// ...
+ ///
+ /// %b + (n - 1) * %s + 0
+ /// %b + (n - 1) * %s + 1
+ /// %b + (n - 1) * %s + 2
+ /// ...
+ /// %b + (n - 1) * %s + (w - 1)
+ ///
+ /// In this case we will generate a strided load of type `<n x (k * w)>`.
+ ///
+ /// \param PointerOps list of pointer arguments of loads.
+ /// \param ElemTy original scalar type of loads.
+ /// \param Alignment alignment of the first load.
+ /// \param SortedIndices is the order of PointerOps as returned by
+ /// `sortPtrAccesses`
+ /// \param Diff Pointer difference between the lowest and the highest pointer
+ /// in `PointerOps` as returned by `getPointersDiff`.
+ /// \param Ptr0 first pointer in `PointerOps`.
+ /// \param PtrN last pointer in `PointerOps`.
+ /// \param SPtrInfo If the function returns `true`, it also sets all the fields
+ /// of `SPtrInfo` necessary to generate the strided load later.
+ bool analyzeConstantStrideCandidate(
+ const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
+ const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+ Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
/// Return true if an array of scalar loads can be replaced with a strided
/// load (with run-time stride).
@@ -6849,9 +6890,8 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
/// current graph (for masked gathers extra extractelement instructions
/// might be required).
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
- Align Alignment, const int64_t Diff, Value *Ptr0,
- Value *PtrN, StridedPtrInfo &SPtrInfo) const {
- const size_t Sz = PointerOps.size();
+ Align Alignment, const int64_t Diff,
+ const size_t Sz) const {
if (Diff % (Sz - 1) != 0)
return false;
@@ -6875,27 +6915,40 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
return false;
if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
return false;
+ return true;
+ }
+ return false;
+}
- // Iterate through all pointers and check if all distances are
- // unique multiple of Dist.
- SmallSet<int64_t, 4> Dists;
- for (Value *Ptr : PointerOps) {
- int64_t Dist = 0;
- if (Ptr == PtrN)
- Dist = Diff;
- else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
- // If the strides are not the same or repeated, we can't
- // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
- break;
- }
- if (Dists.size() == Sz) {
- Type *StrideTy = DL->getIndexType(Ptr0->getType());
- SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
- SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
- return true;
- }
+bool BoUpSLP::analyzeConstantStrideCandidate(
+ const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
+ const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
+ Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
+ const size_t Sz = PointerOps.size();
+ if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
+ return false;
+
+ int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
+
+ // Iterate through all pointers and check if all distances are
+ // unique multiple of Dist.
+ SmallSet<int64_t, 4> Dists;
+ for (Value *Ptr : PointerOps) {
+ int64_t Dist = 0;
+ if (Ptr == PtrN)
+ Dist = Diff;
+ else if (Ptr != Ptr0)
+ Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
+ // If the strides are not the same or repeated, we can't
+ // vectorize.
+ if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+ break;
+ }
+ if (Dists.size() == Sz) {
+ Type *StrideTy = DL->getIndexType(Ptr0->getType());
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
+ SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
+ return true;
}
return false;
}
@@ -6995,8 +7048,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
Align Alignment =
cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
->getAlign();
- if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN,
- SPtrInfo))
+ if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
+ *Diff, Ptr0, PtrN, SPtrInfo))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -17632,7 +17685,9 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
}
if (IsPHI ||
(!E->isGather() && E->State != TreeEntry::SplitVectorize &&
- E->doesNotNeedToSchedule()) ||
+ (E->doesNotNeedToSchedule() ||
+ (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
+ isUsedOutsideBlock(LastInst)))) ||
(GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
E->getOpcode() == Instruction::Load)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b042..84d2ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -407,6 +407,10 @@ public:
VPBasicBlock *getParent() { return Parent; }
const VPBasicBlock *getParent() const { return Parent; }
+ /// \return the VPRegionBlock which the recipe belongs to.
+ VPRegionBlock *getRegion();
+ const VPRegionBlock *getRegion() const;
+
/// The method which generates the output IR instructions that correspond to
/// this VPRecipe, thereby "executing" the VPlan.
virtual void execute(VPTransformState &State) = 0;
@@ -4075,6 +4079,14 @@ public:
}
};
+inline VPRegionBlock *VPRecipeBase::getRegion() {
+ return getParent()->getParent();
+}
+
+inline const VPRegionBlock *VPRecipeBase::getRegion() const {
+ return getParent()->getParent();
+}
+
/// VPlan models a candidate for vectorization, encoding various decisions taken
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63..7e074c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -377,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
#ifndef NDEBUG
auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ VPRegionBlock *Region = R->getRegion();
if (Region && Region->isReplicator()) {
assert(Region->getNumSuccessors() == 1 &&
Region->getNumPredecessors() == 1 && "Expected SESE region!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c75..d1e67e6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2352,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getParent()->getParent()->getCanonicalIV();
+ auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
@@ -3076,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr,
State.AC->registerAssumption(II);
assert(
- (RepRecipe->getParent()->getParent() ||
+ (RepRecipe->getRegion() ||
!RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
all_of(RepRecipe->operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
@@ -3268,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
to_vector(operands()), VF);
// If the recipe is not predicated (i.e. not in a replicate region), return
// the scalar cost. Otherwise handle predicated cost.
- if (!getParent()->getParent()->isReplicator())
+ if (!getRegion()->isReplicator())
return ScalarCost;
// Account for the phi nodes that we will create.
@@ -3284,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::Store: {
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- const VPRegionBlock *ParentRegion = getParent()->getParent();
+ const VPRegionBlock *ParentRegion = getRegion();
if (ParentRegion && ParentRegion->isReplicator())
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee8..f5f616f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1858,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
return nullptr;
VPRegionBlock *EnclosingLoopRegion =
HoistCandidate->getParent()->getEnclosingLoopRegion();
- assert((!HoistCandidate->getParent()->getParent() ||
- HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+ assert((!HoistCandidate->getRegion() ||
+ HoistCandidate->getRegion() == EnclosingLoopRegion) &&
"CFG in VPlan should still be flat, without replicate regions");
// Hoist candidate was already visited, no need to hoist.
if (!Visited.insert(HoistCandidate).second)
@@ -2898,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// evolution.
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
+ return R->getRegion() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
ValueToSCEVMapTy RewriteMap;
@@ -3803,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
- VPRegionBlock *ParentRegion =
- cast<VPRecipeBase>(U)->getParent()->getParent();
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
};
if ((isa<VPReplicateRecipe>(DefR) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..9a2497e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
return true;
if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
- const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+ const VPRegionBlock *RegionOfR = Rep->getRegion();
// Don't consider recipes in replicate regions as uniform yet; their first
// lane cannot be accessed when executing the replicate region for other
// lanes.
diff --git a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
index 83b37da..75e5fa0 100644
--- a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
+++ b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
@@ -3,37 +3,37 @@
; CHECK: ; ModuleID = 'debuginfo.c'
; CHECK-NEXT: source_filename = "debuginfo.c"
-
-; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !44 {
+
+; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !45 {
; CHECK-NEXT: entry:
-; CHECK-NEXT: #dbg_declare(i64 0, !49, !DIExpression(), !58)
-; CHECK-NEXT: #dbg_declare(i64 0, !50, !DIExpression(), !58)
-; CHECK-NEXT: #dbg_declare(i64 0, !51, !DIExpression(), !58)
-; CHECK-NEXT: #dbg_label(!59, !58)
+; CHECK-NEXT: #dbg_declare(i64 0, !50, !DIExpression(), !59)
+; CHECK-NEXT: #dbg_declare(i64 0, !51, !DIExpression(), !59)
+; CHECK-NEXT: #dbg_declare(i64 0, !52, !DIExpression(), !59)
+; CHECK-NEXT: #dbg_label(!60, !59)
; CHECK-NEXT: br label %vars
-; CHECK-NEXT: #dbg_label(!60, !58)
+; CHECK-NEXT: #dbg_label(!61, !59)
; CHECK-NEXT: br label %vars
; CHECK: vars: ; preds = %entry, %entry
; CHECK-NEXT: %p1 = phi i64 [ 0, %entry ]
; CHECK-NEXT: %p2 = phi i64 [ 0, %entry ]
-; CHECK-NEXT: #dbg_value(i64 0, !42, !DIExpression(DW_OP_constu, 0, DW_OP_stack_value), !61)
-; CHECK-NEXT: #dbg_value(i64 1, !52, !DIExpression(DW_OP_constu, 1, DW_OP_stack_value), !61)
+; CHECK-NEXT: #dbg_value(i64 0, !43, !DIExpression(DW_OP_constu, 0, DW_OP_stack_value), !62)
+; CHECK-NEXT: #dbg_value(i64 1, !53, !DIExpression(DW_OP_constu, 1, DW_OP_stack_value), !62)
; CHECK-NEXT: %a = add i64 %p1, %p2
; CHECK-NEXT: ret i64 0
; CHECK-NEXT: }
; CHECK: !llvm.dbg.cu = !{!0}
-; CHECK-NEXT: !FooType = !{!33}
+; CHECK-NEXT: !FooType = !{!34}
; CHECK-NEXT: !EnumTest = !{!3}
; CHECK-NEXT: !LargeEnumTest = !{!11}
-; CHECK-NEXT: !SubrangeType = !{!36}
-; CHECK-NEXT: !SetType1 = !{!37}
-; CHECK-NEXT: !SetType2 = !{!38}
-; CHECK-NEXT: !DynType = !{!39}
-; CHECK-NEXT: !ClassType = !{!54}
+; CHECK-NEXT: !SubrangeType = !{!37}
+; CHECK-NEXT: !SetType1 = !{!38}
+; CHECK-NEXT: !SetType2 = !{!39}
+; CHECK-NEXT: !DynType = !{!40}
+; CHECK-NEXT: !ClassType = !{!55}
-; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "llvm-c-test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !16, imports: !24, macros: !28, splitDebugInlining: false, sysroot: "/")
+; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "llvm-c-test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !16, imports: !24, macros: !29, splitDebugInlining: false, sysroot: "/")
; CHECK-NEXT: !1 = !DIFile(filename: "debuginfo.c", directory: ".")
; CHECK-NEXT: !2 = !{!3, !11}
; CHECK-NEXT: !3 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "EnumTest", scope: !4, file: !1, baseType: !6, size: 64, elements: !7)
@@ -57,41 +57,42 @@
; CHECK-NEXT: !21 = !DIGlobalVariableExpression(var: !22, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value))
; CHECK-NEXT: !22 = distinct !DIGlobalVariable(name: "global", scope: !5, file: !1, line: 1, type: !23, isLocal: true, isDefinition: true)
; CHECK-NEXT: !23 = !DIDerivedType(tag: DW_TAG_typedef, name: "int64_t", scope: !1, file: !1, line: 42, baseType: !6)
-; CHECK-NEXT: !24 = !{!25, !27}
-; CHECK-NEXT: !25 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !26, file: !1, line: 42)
+; CHECK-NEXT: !24 = !{!25, !28}
+; CHECK-NEXT: !25 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !26, file: !27, line: 42)
; CHECK-NEXT: !26 = !DIModule(scope: null, name: "llvm-c-test-import", includePath: "/test/include/llvm-c-test-import.h")
-; CHECK-NEXT: !27 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !25, file: !1, line: 42)
-; CHECK-NEXT: !28 = !{!29}
-; CHECK-NEXT: !29 = !DIMacroFile(file: !1, nodes: !30)
-; CHECK-NEXT: !30 = !{!31, !32}
-; CHECK-NEXT: !31 = !DIMacro(type: DW_MACINFO_define, name: "SIMPLE_DEFINE")
-; CHECK-NEXT: !32 = !DIMacro(type: DW_MACINFO_define, name: "VALUE_DEFINE", value: "1")
-; CHECK-NEXT: !33 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !34, size: 192, dwarfAddressSpace: 0)
-; CHECK-NEXT: !34 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !4, file: !1, size: 192, elements: !35, runtimeLang: DW_LANG_C89, identifier: "MyStruct")
-; CHECK-NEXT: !35 = !{!6, !6, !6}
-; CHECK-NEXT: !36 = !DISubrangeType(name: "foo", scope: !1, file: !1, line: 42, size: 64, baseType: !6, lowerBound: i64 0, upperBound: i64 1, stride: i64 8, bias: i64 4)
-; CHECK-NEXT: !37 = !DIDerivedType(tag: DW_TAG_set_type, name: "enumset", scope: !1, file: !1, line: 42, baseType: !3, size: 64)
-; CHECK-NEXT: !38 = !DIDerivedType(tag: DW_TAG_set_type, name: "subrangeset", scope: !1, file: !1, line: 42, baseType: !36, size: 64)
-; CHECK-NEXT: !39 = !DICompositeType(tag: DW_TAG_array_type, name: "foo", scope: !1, file: !1, line: 42, baseType: !6, size: 640, elements: !40, dataLocation: !DIExpression(), associated: !42, rank: !DIExpression())
-; CHECK-NEXT: !40 = !{!41}
-; CHECK-NEXT: !41 = !DISubrange(count: 10, lowerBound: 0)
-; CHECK-NEXT: !42 = !DILocalVariable(name: "d", scope: !43, file: !1, line: 43, type: !6)
-; CHECK-NEXT: !43 = distinct !DILexicalBlock(scope: !44, file: !1, line: 42)
-; CHECK-NEXT: !44 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !45, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !48)
-; CHECK-NEXT: !45 = !DISubroutineType(types: !46)
-; CHECK-NEXT: !46 = !{!6, !6, !47}
-; CHECK-NEXT: !47 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !40)
-; CHECK-NEXT: !48 = !{!49, !50, !51, !42, !52, !53}
-; CHECK-NEXT: !49 = !DILocalVariable(name: "a", arg: 1, scope: !44, file: !1, line: 42, type: !6)
-; CHECK-NEXT: !50 = !DILocalVariable(name: "b", arg: 2, scope: !44, file: !1, line: 42, type: !6)
-; CHECK-NEXT: !51 = !DILocalVariable(name: "c", arg: 3, scope: !44, file: !1, line: 42, type: !47)
-; CHECK-NEXT: !52 = !DILocalVariable(name: "e", scope: !43, file: !1, line: 44, type: !6)
-; CHECK-NEXT: !53 = !DILabel(scope: !44, name: "label3", file: !1, line: 42)
-; CHECK-NEXT: !54 = !DICompositeType(tag: DW_TAG_class_type, name: "Class", scope: !4, file: !1, size: 192, flags: DIFlagFwdDecl, elements: !55, identifier: "FooClass")
-; CHECK-NEXT: !55 = !{!56}
-; CHECK-NEXT: !56 = !{!6, !6, !57}
-; CHECK-NEXT: !57 = !DIBasicType(name: "Int32", size: 32)
-; CHECK-NEXT: !58 = !DILocation(line: 42, scope: !44)
-; CHECK-NEXT: !59 = !DILabel(scope: !44, name: "label1", file: !1, line: 42)
-; CHECK-NEXT: !60 = !DILabel(scope: !44, name: "label2", file: !1, line: 42)
-; CHECK-NEXT: !61 = !DILocation(line: 43, scope: !44)
+; CHECK-NEXT: !27 = !DIFile(filename: "debuginfo.c", directory: ".", checksumkind: CSK_MD5, checksum: "1234", source: "source")
+; CHECK-NEXT: !28 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !25, file: !1, line: 42)
+; CHECK-NEXT: !29 = !{!30}
+; CHECK-NEXT: !30 = !DIMacroFile(file: !1, nodes: !31)
+; CHECK-NEXT: !31 = !{!32, !33}
+; CHECK-NEXT: !32 = !DIMacro(type: DW_MACINFO_define, name: "SIMPLE_DEFINE")
+; CHECK-NEXT: !33 = !DIMacro(type: DW_MACINFO_define, name: "VALUE_DEFINE", value: "1")
+; CHECK-NEXT: !34 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !35, size: 192, dwarfAddressSpace: 0)
+; CHECK-NEXT: !35 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !4, file: !1, size: 192, elements: !36, runtimeLang: DW_LANG_C89, identifier: "MyStruct")
+; CHECK-NEXT: !36 = !{!6, !6, !6}
+; CHECK-NEXT: !37 = !DISubrangeType(name: "foo", scope: !1, file: !1, line: 42, size: 64, baseType: !6, lowerBound: i64 0, upperBound: i64 1, stride: i64 8, bias: i64 4)
+; CHECK-NEXT: !38 = !DIDerivedType(tag: DW_TAG_set_type, name: "enumset", scope: !1, file: !1, line: 42, baseType: !3, size: 64)
+; CHECK-NEXT: !39 = !DIDerivedType(tag: DW_TAG_set_type, name: "subrangeset", scope: !1, file: !1, line: 42, baseType: !37, size: 64)
+; CHECK-NEXT: !40 = !DICompositeType(tag: DW_TAG_array_type, name: "foo", scope: !1, file: !1, line: 42, baseType: !6, size: 640, elements: !41, dataLocation: !DIExpression(), associated: !43, rank: !DIExpression())
+; CHECK-NEXT: !41 = !{!42}
+; CHECK-NEXT: !42 = !DISubrange(count: 10, lowerBound: 0)
+; CHECK-NEXT: !43 = !DILocalVariable(name: "d", scope: !44, file: !1, line: 43, type: !6)
+; CHECK-NEXT: !44 = distinct !DILexicalBlock(scope: !45, file: !1, line: 42)
+; CHECK-NEXT: !45 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !46, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !49)
+; CHECK-NEXT: !46 = !DISubroutineType(types: !47)
+; CHECK-NEXT: !47 = !{!6, !6, !48}
+; CHECK-NEXT: !48 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !41)
+; CHECK-NEXT: !49 = !{!50, !51, !52, !43, !53, !54}
+; CHECK-NEXT: !50 = !DILocalVariable(name: "a", arg: 1, scope: !45, file: !1, line: 42, type: !6)
+; CHECK-NEXT: !51 = !DILocalVariable(name: "b", arg: 2, scope: !45, file: !1, line: 42, type: !6)
+; CHECK-NEXT: !52 = !DILocalVariable(name: "c", arg: 3, scope: !45, file: !1, line: 42, type: !48)
+; CHECK-NEXT: !53 = !DILocalVariable(name: "e", scope: !44, file: !1, line: 44, type: !6)
+; CHECK-NEXT: !54 = !DILabel(scope: !45, name: "label3", file: !1, line: 42)
+; CHECK-NEXT: !55 = !DICompositeType(tag: DW_TAG_class_type, name: "Class", scope: !4, file: !1, size: 192, flags: DIFlagFwdDecl, elements: !56, identifier: "FooClass")
+; CHECK-NEXT: !56 = !{!57}
+; CHECK-NEXT: !57 = !{!6, !6, !58}
+; CHECK-NEXT: !58 = !DIBasicType(name: "Int32", size: 32)
+; CHECK-NEXT: !59 = !DILocation(line: 42, scope: !45)
+; CHECK-NEXT: !60 = !DILabel(scope: !45, name: "label1", file: !1, line: 42)
+; CHECK-NEXT: !61 = !DILabel(scope: !45, name: "label2", file: !1, line: 42)
+; CHECK-NEXT: !62 = !DILocation(line: 43, scope: !45)
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
index 7633ba0..66cc7f3 100644
--- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -15,7 +15,7 @@ define i16 @abs_i16(i16 %arg) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: v_max_i32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: abs_i16:
@@ -23,7 +23,7 @@ define i16 @abs_i16(i16 %arg) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX7-NEXT: v_max_i32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_i16:
@@ -97,9 +97,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX6-NEXT: v_max_i32_e32 v0, v2, v0
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -110,9 +110,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX7-NEXT: v_max_i32_e32 v0, v2, v0
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX7-NEXT: v_max_i32_e32 v1, v2, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -172,15 +172,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v3, v0
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v3i16:
@@ -189,15 +189,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v3, v0
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v3, v1
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_max_i32_e32 v2, v3, v2
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v3i16:
@@ -262,47 +262,45 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
; GFX6-LABEL: v_abs_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v4, v0
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v4i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v4, v0
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v4, v1
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX7-NEXT: v_max_i32_e32 v2, v4, v2
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v4i16:
@@ -370,63 +368,61 @@ define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) {
; GFX6-LABEL: v_abs_v6i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v6, v0
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v6, v1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v6, v5
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX6-NEXT: v_max_i32_e32 v2, v6, v2
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v6
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v3, v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v3
-; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v6i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v6, v0
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v6, v1
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v6, v4
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v6, v5
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_i32_e32 v2, v6, v2
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v6
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v3, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v3
-; GFX7-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v1, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v6i16:
@@ -509,83 +505,79 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
; GFX6-LABEL: v_abs_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v8, v0
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v8, v1
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v5
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v8, v5
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v8
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v8, v7
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v8, v2
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v8
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v8i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v8, v0
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v8, v1
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v8, v4
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v5
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v8, v5
; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX7-NEXT: v_max_i32_e32 v6, v8, v6
; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v8
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v8, v7
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v8, v2
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v8
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v8i16:
@@ -682,155 +674,147 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX6-LABEL: v_abs_v16i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v16, v0
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v16, v1
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v16, v4
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v16, v5
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v8, v16, v8
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v9
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v9, v16, v9
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v12
; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v12, v16, v12
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v13
; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v13, v16, v13
; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
-; GFX6-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX6-NEXT: v_max_i32_e32 v14, v16, v14
; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
-; GFX6-NEXT: v_max_i32_e32 v15, v15, v16
-; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v16, v15
; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v16, v10
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v16, v11
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v11
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v16, v6
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v16, v7
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v16, v2
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v16, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v16
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v16i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v16, v0
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v16, v1
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v16, v4
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v16, v5
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v8, v16, v8
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v9
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v9, v16, v9
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v12
; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v12, v16, v12
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v13
; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v13, v16, v13
; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
-; GFX7-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX7-NEXT: v_max_i32_e32 v14, v16, v14
; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
-; GFX7-NEXT: v_max_i32_e32 v15, v15, v16
-; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v16, v15
; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v16, v10
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v16, v11
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v11
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v16, v6
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v16, v7
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v16, v2
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v16
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v16i16:
@@ -974,303 +958,287 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX6-LABEL: v_abs_v32i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v31, v0
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v31, v1
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v4
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v31, v4
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v5
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v31, v5
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v8
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v8, v31, v8
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v9
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v9, v31, v9
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v12
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v12, v31, v12
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v13
+; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v13, v31, v13
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v16
+; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v16, v31, v16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v17
+; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v17, v31, v17
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v20
+; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v20, v31, v20
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v21
+; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v21, v31, v21
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v24, v31, v24
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v25, v31, v25
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX6-NEXT: v_max_i32_e32 v28, v31, v28
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX6-NEXT: v_max_i32_e32 v29, v31, v29
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX6-NEXT: v_max_i32_e32 v30, v31, v30
+; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v30, v30, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v26, v26, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
-; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v27, v27, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
-; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v24, v24, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v25, v25, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v22, v22, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
-; GFX6-NEXT: v_max_i32_e32 v23, v23, v31
-; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX6-NEXT: v_or_b32_e32 v22, v22, v23
-; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
-; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
-; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
-; GFX6-NEXT: v_max_i32_e32 v20, v20, v29
; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16
; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16
-; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX6-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX6-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v23, v31, 0, 16
-; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
-; GFX6-NEXT: v_max_i32_e32 v23, v23, v25
-; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX6-NEXT: v_or_b32_e32 v30, v30, v23
-; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
-; GFX6-NEXT: v_max_i32_e32 v21, v21, v23
-; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
-; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
-; GFX6-NEXT: v_max_i32_e32 v18, v18, v21
-; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v21
-; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX6-NEXT: v_or_b32_e32 v18, v18, v19
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
-; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
-; GFX6-NEXT: v_max_i32_e32 v14, v14, v17
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
-; GFX6-NEXT: v_max_i32_e32 v15, v15, v17
-; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
-; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX6-NEXT: v_alignbit_b32 v17, v18, v16, 16
-; GFX6-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GFX6-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GFX6-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX6-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GFX6-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GFX6-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v31, v31, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v31
+; GFX6-NEXT: v_max_i32_e32 v31, v32, v31
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; GFX6-NEXT: v_or_b32_e32 v30, v30, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v26
+; GFX6-NEXT: v_max_i32_e32 v26, v32, v26
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v27
+; GFX6-NEXT: v_max_i32_e32 v27, v32, v27
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; GFX6-NEXT: v_or_b32_e32 v26, v26, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v22
+; GFX6-NEXT: v_max_i32_e32 v22, v32, v22
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v23
+; GFX6-NEXT: v_max_i32_e32 v23, v32, v23
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; GFX6-NEXT: v_or_b32_e32 v22, v22, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v18
+; GFX6-NEXT: v_max_i32_e32 v18, v32, v18
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v19
+; GFX6-NEXT: v_max_i32_e32 v19, v32, v19
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; GFX6-NEXT: v_or_b32_e32 v18, v18, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v14, v32, v14
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v32, v15
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v32, v10
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v32, v11
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v32, v6
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v32, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v32, v2
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v32, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v32
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX6-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GFX6-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GFX6-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GFX6-NEXT: v_alignbit_b32 v29, v30, v29, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v32i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v31, v0
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v31, v1
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v4
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v31, v4
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v5
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v31, v5
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v8
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v8, v31, v8
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v9
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v9, v31, v9
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v12
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v12, v31, v12
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v13
+; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v13, v31, v13
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v16
+; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v16, v31, v16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v17
+; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v17, v31, v17
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v20
+; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v20, v31, v20
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v21
+; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v21, v31, v21
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v24, v31, v24
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v25, v31, v25
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX7-NEXT: v_max_i32_e32 v28, v31, v28
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX7-NEXT: v_max_i32_e32 v29, v31, v29
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX7-NEXT: v_max_i32_e32 v30, v31, v30
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v30, v30, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v26, v26, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
-; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v27, v27, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
-; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v24, v24, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v25, v25, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v22, v22, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
-; GFX7-NEXT: v_max_i32_e32 v23, v23, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX7-NEXT: v_or_b32_e32 v22, v22, v23
-; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
-; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
-; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
-; GFX7-NEXT: v_max_i32_e32 v20, v20, v29
; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16
; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16
-; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX7-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v23, v31, 0, 16
-; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
-; GFX7-NEXT: v_max_i32_e32 v23, v23, v25
-; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_or_b32_e32 v30, v30, v23
-; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
-; GFX7-NEXT: v_max_i32_e32 v21, v21, v23
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
-; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
-; GFX7-NEXT: v_max_i32_e32 v18, v18, v21
-; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
-; GFX7-NEXT: v_max_i32_e32 v19, v19, v21
-; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX7-NEXT: v_or_b32_e32 v18, v18, v19
-; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
-; GFX7-NEXT: v_max_i32_e32 v16, v16, v19
-; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
-; GFX7-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
-; GFX7-NEXT: v_max_i32_e32 v14, v14, v17
-; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
-; GFX7-NEXT: v_max_i32_e32 v15, v15, v17
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX7-NEXT: v_alignbit_b32 v17, v18, v16, 16
-; GFX7-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GFX7-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GFX7-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v31, v31, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v31
+; GFX7-NEXT: v_max_i32_e32 v31, v32, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v30, v30, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v26
+; GFX7-NEXT: v_max_i32_e32 v26, v32, v26
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v27
+; GFX7-NEXT: v_max_i32_e32 v27, v32, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; GFX7-NEXT: v_or_b32_e32 v26, v26, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v22
+; GFX7-NEXT: v_max_i32_e32 v22, v32, v22
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v23
+; GFX7-NEXT: v_max_i32_e32 v23, v32, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; GFX7-NEXT: v_or_b32_e32 v22, v22, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v18
+; GFX7-NEXT: v_max_i32_e32 v18, v32, v18
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v19
+; GFX7-NEXT: v_max_i32_e32 v19, v32, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; GFX7-NEXT: v_or_b32_e32 v18, v18, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v14
+; GFX7-NEXT: v_max_i32_e32 v14, v32, v14
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v32, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v32, v10
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v32, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v32, v6
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v32, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v32, v2
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v32, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v32
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v32i16:
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index d25bfbb..12309f3 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -780,7 +780,7 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -790,11 +790,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 117af95..74552a5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -29177,870 +29177,1844 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_branch .LBB19_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v32i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB19_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB19_3
-; GFX11-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB19_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB19_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB19_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
+; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB19_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3
+; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB19_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB19_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB19_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -66313,870 +67287,1844 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX9-NEXT: .LBB43_4:
; GFX9-NEXT: s_branch .LBB43_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v32f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB43_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB43_3
-; GFX11-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB43_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB43_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB43_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
+; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB43_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3
+; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB43_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB43_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB43_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -101184,870 +103132,1844 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: .LBB63_4:
; GFX9-NEXT: s_branch .LBB63_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v16i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB63_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB63_3
-; GFX11-NEXT: .LBB63_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB63_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB63_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB63_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB63_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB63_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB63_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB63_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB63_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB63_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -136197,870 +139119,1844 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX9-NEXT: .LBB79_4:
; GFX9-NEXT: s_branch .LBB79_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v16f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB79_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB79_3
-; GFX11-NEXT: .LBB79_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB79_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB79_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB79_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB79_3
+; GFX11-TRUE16-NEXT: .LBB79_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB79_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB79_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB79_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3
+; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB79_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB79_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB79_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -154174,9 +158070,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
@@ -154192,6 +158089,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -154202,201 +158100,169 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB89_3
; GFX11-TRUE16-NEXT: .LBB89_2: ; %cmp.true
@@ -154436,57 +158302,59 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
@@ -154495,7 +158363,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -154506,18 +158374,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v150, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -154525,13 +158393,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
@@ -154540,29 +158408,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -154576,8 +158444,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
@@ -154585,167 +158453,141 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v103, 0x300, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v128, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v113, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v102, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v96
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v97, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v100
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v87, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v86, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v85, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v84, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v83, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v82, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v81, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v71, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v70, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v69, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v112, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v68, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v66, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v65, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v34
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v36.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.h, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.h, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.h, v144.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.h, v145.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.h, v131.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.h, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.h, v150.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.h, v160.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.h, v179.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.h, v181.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB89_3: ; %end
; GFX11-TRUE16-NEXT: s_clause 0x1e
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
@@ -168348,1575 +172190,3138 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v64bf16_to_v128i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_store_b32 off, v40, s32
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:12
-; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v41, s96, 0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_readfirstlane_b32 s72, v1
-; GFX11-NEXT: v_readfirstlane_b32 s73, v2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v41, s97, 1
-; GFX11-NEXT: v_readfirstlane_b32 s62, v3
-; GFX11-NEXT: v_readfirstlane_b32 s63, v4
-; GFX11-NEXT: v_readfirstlane_b32 s60, v5
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v41, s98, 2
-; GFX11-NEXT: v_readfirstlane_b32 s61, v6
-; GFX11-NEXT: v_readfirstlane_b32 s58, v7
-; GFX11-NEXT: v_readfirstlane_b32 s59, v8
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v41, s99, 3
-; GFX11-NEXT: v_readfirstlane_b32 s56, v9
-; GFX11-NEXT: v_readfirstlane_b32 s57, v10
-; GFX11-NEXT: v_readfirstlane_b32 s46, v11
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v41, s100, 4
-; GFX11-NEXT: v_readfirstlane_b32 s47, v12
-; GFX11-NEXT: v_readfirstlane_b32 s44, v13
-; GFX11-NEXT: v_readfirstlane_b32 s45, v14
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v41, s101, 5
-; GFX11-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v41, s102, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v41, s103, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v41, s104, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
-; GFX11-NEXT: v_writelane_b32 v40, s50, 10
-; GFX11-NEXT: v_writelane_b32 v40, s51, 11
-; GFX11-NEXT: v_writelane_b32 v40, s52, 12
-; GFX11-NEXT: v_writelane_b32 v40, s53, 13
-; GFX11-NEXT: v_writelane_b32 v40, s54, 14
-; GFX11-NEXT: v_writelane_b32 v40, s55, 15
-; GFX11-NEXT: v_writelane_b32 v40, s64, 16
-; GFX11-NEXT: v_writelane_b32 v40, s65, 17
-; GFX11-NEXT: v_writelane_b32 v40, s66, 18
-; GFX11-NEXT: v_writelane_b32 v40, s67, 19
-; GFX11-NEXT: v_writelane_b32 v40, s68, 20
-; GFX11-NEXT: v_writelane_b32 v40, s69, 21
-; GFX11-NEXT: v_writelane_b32 v40, s70, 22
-; GFX11-NEXT: v_writelane_b32 v40, s71, 23
-; GFX11-NEXT: v_writelane_b32 v40, s80, 24
-; GFX11-NEXT: v_writelane_b32 v40, s81, 25
-; GFX11-NEXT: v_writelane_b32 v40, s82, 26
-; GFX11-NEXT: v_writelane_b32 v40, s83, 27
-; GFX11-NEXT: v_writelane_b32 v40, s84, 28
-; GFX11-NEXT: v_writelane_b32 v40, s85, 29
-; GFX11-NEXT: v_writelane_b32 v40, s86, 30
-; GFX11-NEXT: v_writelane_b32 v40, s87, 31
-; GFX11-NEXT: s_cbranch_scc0 .LBB91_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s4, s27, 24
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[26:27], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 15
-; GFX11-NEXT: s_lshr_b32 s4, s27, 16
-; GFX11-NEXT: s_lshr_b32 s99, s2, 16
-; GFX11-NEXT: s_lshr_b32 s100, s2, 8
-; GFX11-NEXT: s_lshr_b32 s101, s1, 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 14
-; GFX11-NEXT: s_lshr_b32 s4, s27, 8
-; GFX11-NEXT: s_lshr_b32 s11, s1, 16
-; GFX11-NEXT: s_lshr_b32 s102, s1, 8
-; GFX11-NEXT: s_lshr_b32 s103, s0, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 16
-; GFX11-NEXT: s_lshr_b32 s4, s26, 16
-; GFX11-NEXT: s_lshr_b32 s104, s0, 8
-; GFX11-NEXT: s_lshr_b32 s85, s45, 24
-; GFX11-NEXT: s_lshr_b32 s10, s45, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 17
-; GFX11-NEXT: s_lshr_b32 s4, s26, 8
-; GFX11-NEXT: s_lshr_b32 s5, s45, 8
-; GFX11-NEXT: s_lshr_b32 s87, s44, 16
-; GFX11-NEXT: s_lshr_b32 s86, s44, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 18
-; GFX11-NEXT: s_lshr_b32 s4, s25, 24
-; GFX11-NEXT: s_lshr_b32 s81, s47, 24
-; GFX11-NEXT: s_lshr_b32 s98, s47, 16
-; GFX11-NEXT: s_lshr_b32 s84, s47, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 19
-; GFX11-NEXT: s_lshr_b32 s4, s25, 16
-; GFX11-NEXT: s_lshr_b32 s48, s46, 8
-; GFX11-NEXT: s_lshr_b32 s70, s57, 24
-; GFX11-NEXT: s_lshr_b32 s97, s57, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 13
-; GFX11-NEXT: s_lshr_b32 s4, s25, 8
-; GFX11-NEXT: s_lshr_b32 s80, s57, 8
-; GFX11-NEXT: s_lshr_b32 s83, s56, 16
-; GFX11-NEXT: s_lshr_b32 s82, s56, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 20
-; GFX11-NEXT: s_lshr_b32 s4, s24, 16
-; GFX11-NEXT: s_lshr_b32 s66, s59, 24
-; GFX11-NEXT: s_lshr_b32 s9, s59, 16
-; GFX11-NEXT: s_lshr_b32 s69, s59, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 21
-; GFX11-NEXT: s_lshr_b32 s4, s24, 8
-; GFX11-NEXT: s_lshr_b32 s71, s58, 16
-; GFX11-NEXT: s_lshr_b32 s39, s58, 8
-; GFX11-NEXT: s_lshr_b32 s55, s61, 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 22
-; GFX11-NEXT: s_lshr_b32 s4, s23, 24
-; GFX11-NEXT: s_lshr_b32 s8, s61, 16
-; GFX11-NEXT: s_lshr_b32 s65, s61, 8
-; GFX11-NEXT: s_lshr_b32 s68, s60, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 23
-; GFX11-NEXT: s_lshr_b32 s4, s23, 16
-; GFX11-NEXT: s_lshr_b32 s67, s60, 8
-; GFX11-NEXT: s_lshr_b32 s51, s63, 24
-; GFX11-NEXT: s_lshr_b32 s96, s63, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 12
-; GFX11-NEXT: s_lshr_b32 s4, s23, 8
-; GFX11-NEXT: s_lshr_b32 s54, s63, 8
-; GFX11-NEXT: s_lshr_b32 s38, s62, 16
-; GFX11-NEXT: s_lshr_b32 s64, s62, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 24
-; GFX11-NEXT: s_lshr_b32 s4, s22, 16
-; GFX11-NEXT: s_lshr_b32 s36, s73, 24
-; GFX11-NEXT: s_lshr_b32 s7, s73, 16
-; GFX11-NEXT: s_lshr_b32 s50, s73, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 25
-; GFX11-NEXT: s_lshr_b32 s4, s22, 8
-; GFX11-NEXT: s_lshr_b32 s53, s72, 16
-; GFX11-NEXT: s_lshr_b32 s52, s72, 8
-; GFX11-NEXT: s_lshr_b32 s34, s29, 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 26
-; GFX11-NEXT: s_lshr_b32 s4, s21, 24
-; GFX11-NEXT: s_lshr_b32 s6, s29, 16
-; GFX11-NEXT: s_lshr_b32 s35, s29, 8
-; GFX11-NEXT: s_lshr_b32 s37, s28, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 27
-; GFX11-NEXT: s_lshr_b32 s4, s21, 16
-; GFX11-NEXT: s_lshr_b32 s49, s28, 8
-; GFX11-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
-; GFX11-NEXT: s_lshr_b64 s[40:41], s[2:3], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 11
-; GFX11-NEXT: s_lshr_b32 s4, s21, 8
-; GFX11-NEXT: s_lshr_b64 s[42:43], s[0:1], 24
-; GFX11-NEXT: s_lshr_b64 s[74:75], s[44:45], 24
-; GFX11-NEXT: s_lshr_b64 s[76:77], s[46:47], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 28
-; GFX11-NEXT: s_lshr_b32 s4, s20, 16
-; GFX11-NEXT: s_lshr_b64 s[78:79], s[56:57], 24
-; GFX11-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
-; GFX11-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 29
-; GFX11-NEXT: s_lshr_b32 s4, s20, 8
-; GFX11-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
-; GFX11-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
-; GFX11-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 30
-; GFX11-NEXT: s_lshr_b32 s4, s19, 24
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v43, s4, 31
-; GFX11-NEXT: s_lshr_b32 s4, s19, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 10
-; GFX11-NEXT: s_lshr_b32 s4, s19, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 0
-; GFX11-NEXT: s_lshr_b32 s4, s18, 16
-; GFX11-NEXT: v_writelane_b32 v42, s4, 1
-; GFX11-NEXT: s_lshr_b32 s4, s18, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 2
-; GFX11-NEXT: s_lshr_b32 s4, s17, 24
-; GFX11-NEXT: v_writelane_b32 v42, s4, 3
-; GFX11-NEXT: s_lshr_b32 s4, s17, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v43, s4, 9
-; GFX11-NEXT: s_lshr_b32 s4, s17, 8
-; GFX11-NEXT: v_writelane_b32 v42, s4, 4
-; GFX11-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 5
-; GFX11-NEXT: s_lshr_b32 s4, s16, 8
-; GFX11-NEXT: v_writelane_b32 v42, s4, 6
-; GFX11-NEXT: s_lshr_b32 s4, s3, 24
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 7
-; GFX11-NEXT: s_lshr_b32 s4, s3, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 8
-; GFX11-NEXT: s_lshr_b32 s4, s3, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 8
-; GFX11-NEXT: s_lshr_b32 s4, s46, 16
-; GFX11-NEXT: v_writelane_b32 v43, s12, 6
-; GFX11-NEXT: v_writelane_b32 v43, s13, 7
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX11-NEXT: v_writelane_b32 v43, s12, 4
-; GFX11-NEXT: v_writelane_b32 v43, s13, 5
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v43, s12, 2
-; GFX11-NEXT: v_writelane_b32 v43, s13, 3
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[20:21], 24
-; GFX11-NEXT: v_writelane_b32 v43, s12, 0
-; GFX11-NEXT: v_writelane_b32 v43, s13, 1
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[18:19], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
-; GFX11-NEXT: s_cbranch_vccnz .LBB91_4
-; GFX11-NEXT: .LBB91_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX11-NEXT: s_and_b32 s14, s47, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s15, s47, 16
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s29, 16
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
-; GFX11-NEXT: s_and_b32 s8, s45, 0xffff0000
-; GFX11-NEXT: v_readfirstlane_b32 s47, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_lshl_b32 s7, s45, 16
-; GFX11-NEXT: s_and_b32 s78, s28, 0xffff0000
-; GFX11-NEXT: s_bfe_u32 s6, s47, 0x10010
-; GFX11-NEXT: s_lshl_b32 s79, s28, 16
-; GFX11-NEXT: s_add_i32 s45, s6, s47
-; GFX11-NEXT: s_and_b32 s5, s73, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s77, s73, 16
-; GFX11-NEXT: s_and_b32 s75, s72, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s76, s72, 16
-; GFX11-NEXT: s_and_b32 s11, s63, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s74, s63, 16
-; GFX11-NEXT: s_and_b32 s72, s62, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s73, s62, 16
-; GFX11-NEXT: s_and_b32 s63, s61, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s62, s61, 16
-; GFX11-NEXT: s_and_b32 s61, s60, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s60, s60, 16
-; GFX11-NEXT: s_and_b32 s41, s59, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s40, s59, 16
-; GFX11-NEXT: s_and_b32 s28, s58, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s29, s58, 16
-; GFX11-NEXT: s_and_b32 s13, s57, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s10, s57, 16
-; GFX11-NEXT: s_and_b32 s42, s56, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s43, s56, 16
-; GFX11-NEXT: s_and_b32 s12, s46, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s9, s46, 16
-; GFX11-NEXT: s_and_b32 s4, s44, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s44, 16
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s47, 22
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_cselect_b32 s44, s47, s45
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v2
-; GFX11-NEXT: s_lshr_b32 s58, s44, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s78
-; GFX11-NEXT: v_readfirstlane_b32 s1, v3
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s79
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_bfe_u32 s45, s1, 0x10010
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: s_add_i32 s45, s45, s1
-; GFX11-NEXT: s_bitset1_b32 s1, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s1, s1, s45
-; GFX11-NEXT: s_and_b32 s44, s0, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s44
-; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v6
-; GFX11-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_readfirstlane_b32 s44, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v7
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v6
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v21
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s5
-; GFX11-NEXT: v_readfirstlane_b32 s0, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s77
-; GFX11-NEXT: s_bfe_u32 s5, s0, 0x10010
-; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v4
-; GFX11-NEXT: s_add_i32 s45, s5, s0
-; GFX11-NEXT: s_lshr_b32 s5, s44, 16
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s0, 22
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s0, s0, s45
-; GFX11-NEXT: s_and_b32 s44, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s44
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v23
-; GFX11-NEXT: v_readfirstlane_b32 s44, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v5
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s3
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s76
-; GFX11-NEXT: s_lshr_b32 s59, s44, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s75
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_readfirstlane_b32 s3, v10
-; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v87, 24, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: s_bfe_u32 s45, s3, 0x10010
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_add_i32 s45, s45, s3
-; GFX11-NEXT: s_bitset1_b32 s3, 22
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s3, s3, s45
-; GFX11-NEXT: s_and_b32 s44, s2, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s44
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v9
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_readfirstlane_b32 s44, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v24
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s74
-; GFX11-NEXT: v_lshl_or_b32 v14, v25, 16, v5
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v14
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_bfe_u32 s11, s2, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_add_i32 s45, s11, s2
-; GFX11-NEXT: s_lshr_b32 s11, s44, 16
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s2, 22
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s2, s2, s45
-; GFX11-NEXT: s_and_b32 s44, s17, 0xffff0000
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v26
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s44
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: v_lshl_or_b32 v13, v2, 16, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_readfirstlane_b32 s44, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s17, s17, 16
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s73
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s72
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_readfirstlane_b32 s17, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshr_b32 s72, s44, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: s_bfe_u32 s45, s17, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v27
-; GFX11-NEXT: s_add_i32 s45, s45, s17
-; GFX11-NEXT: s_bitset1_b32 s17, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: v_lshl_or_b32 v16, v28, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: s_cselect_b32 s17, s17, s45
-; GFX11-NEXT: s_and_b32 s44, s16, 0xffff0000
-; GFX11-NEXT: s_lshr_b32 s17, s17, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s63
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 24, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v29
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s44
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_readfirstlane_b32 s44, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s16, s16, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s16
-; GFX11-NEXT: s_lshr_b32 s46, s44, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s16, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s62
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_bfe_u32 s45, s16, 0x10010
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s45, s45, s16
-; GFX11-NEXT: s_bitset1_b32 s16, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s16, s16, s45
-; GFX11-NEXT: s_and_b32 s44, s19, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s44
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v5
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: s_lshr_b32 s16, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s44, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v4
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s60
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s61
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s19, s19, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s19
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-NEXT: s_lshr_b32 s60, s44, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-NEXT: v_readfirstlane_b32 s19, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v9, v8
-; GFX11-NEXT: s_bfe_u32 s45, s19, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX11-NEXT: s_add_i32 s45, s45, s19
-; GFX11-NEXT: s_bitset1_b32 s19, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s19, s19, s45
-; GFX11-NEXT: s_and_b32 s44, s18, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s44
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: s_lshr_b32 s19, s19, 16
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s29
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s41
-; GFX11-NEXT: v_readfirstlane_b32 s41, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_pack_ll_b32_b16 s47, s17, s72
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v2
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: s_bfe_u32 s44, s41, 0x10010
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_add_i32 s44, s44, s41
-; GFX11-NEXT: s_bitset1_b32 s41, 22
-; GFX11-NEXT: s_addk_i32 s44, 0x7fff
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s40
-; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s41, s41, s44
-; GFX11-NEXT: s_lshl_b32 s18, s18, 16
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v31
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s18
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: v_lshl_or_b32 v18, v30, 16, v4
-; GFX11-NEXT: v_readfirstlane_b32 s18, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshl_or_b32 v17, v1, 16, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: s_bfe_u32 s40, s18, 0x10010
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s28
-; GFX11-NEXT: s_add_i32 s44, s40, s18
-; GFX11-NEXT: s_lshr_b32 s40, s41, 16
-; GFX11-NEXT: s_addk_i32 s44, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s18, 22
-; GFX11-NEXT: s_and_b32 s41, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s18, s18, s44
-; GFX11-NEXT: s_and_b32 s41, s21, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s41
-; GFX11-NEXT: v_bfe_u32 v2, v9, 16, 1
-; GFX11-NEXT: s_lshr_b32 s18, s18, 16
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s28, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v9
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v10, 16, 1
-; GFX11-NEXT: s_bfe_u32 s29, s28, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; GFX11-NEXT: s_add_i32 s29, s29, s28
-; GFX11-NEXT: s_bitset1_b32 s28, 22
-; GFX11-NEXT: s_addk_i32 s29, 0x7fff
-; GFX11-NEXT: s_and_b32 s41, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s28, s28, s29
-; GFX11-NEXT: s_lshl_b32 s21, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s21
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshr_b32 s61, s28, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s44, s2, s11
-; GFX11-NEXT: v_readfirstlane_b32 s21, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_bfe_u32 s29, s21, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v2
-; GFX11-NEXT: s_add_i32 s29, s29, s21
-; GFX11-NEXT: s_bitset1_b32 s21, 22
-; GFX11-NEXT: s_addk_i32 s29, 0x7fff
-; GFX11-NEXT: s_and_b32 s28, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s21, s21, s29
-; GFX11-NEXT: s_and_b32 s28, s20, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v10
-; GFX11-NEXT: s_lshr_b32 s21, s21, 16
-; GFX11-NEXT: s_pack_ll_b32_b16 s45, s3, s59
-; GFX11-NEXT: s_pack_ll_b32_b16 s46, s16, s46
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s13
-; GFX11-NEXT: v_readfirstlane_b32 s13, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_bfe_u32 s28, s13, 0x10010
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v34
-; GFX11-NEXT: s_add_i32 s28, s28, s13
-; GFX11-NEXT: s_bitset1_b32 s13, 22
-; GFX11-NEXT: s_addk_i32 s28, 0x7fff
-; GFX11-NEXT: s_and_b32 s29, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s13, s13, s28
-; GFX11-NEXT: s_lshl_b32 s20, s20, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s20
-; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s10
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v35
-; GFX11-NEXT: v_readfirstlane_b32 s20, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v19, v2, 16, v9
-; GFX11-NEXT: s_bfe_u32 s10, s20, 0x10010
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_add_i32 s28, s10, s20
-; GFX11-NEXT: s_lshr_b32 s10, s13, 16
-; GFX11-NEXT: s_addk_i32 s28, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s20, 22
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_cselect_b32 s13, s20, s28
-; GFX11-NEXT: s_and_b32 s20, s23, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s42
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s20
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s43
-; GFX11-NEXT: v_readfirstlane_b32 s28, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_bfe_u32 s20, s28, 0x10010
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: s_add_i32 s29, s20, s28
-; GFX11-NEXT: s_lshr_b32 s20, s13, 16
-; GFX11-NEXT: s_addk_i32 s29, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s28, 22
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s13, s28, s29
-; GFX11-NEXT: s_lshl_b32 s23, s23, 16
-; GFX11-NEXT: v_bfe_u32 v5, v9, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s23
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v8
-; GFX11-NEXT: s_lshr_b32 s62, s13, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v9
-; GFX11-NEXT: v_readfirstlane_b32 s23, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_bfe_u32 s28, s23, 0x10010
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: s_add_i32 s28, s28, s23
-; GFX11-NEXT: s_bitset1_b32 s23, 22
-; GFX11-NEXT: s_addk_i32 s28, 0x7fff
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_cselect_b32 s13, s23, s28
-; GFX11-NEXT: s_and_b32 s23, s22, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s15
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s23
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s14
-; GFX11-NEXT: s_lshr_b32 s23, s13, 16
-; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_lshl_or_b32 v71, v37, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s12
-; GFX11-NEXT: s_bfe_u32 s15, s14, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: s_add_i32 s15, s15, s14
-; GFX11-NEXT: s_bitset1_b32 s14, 22
-; GFX11-NEXT: s_addk_i32 s15, 0x7fff
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s13, s14, s15
-; GFX11-NEXT: s_lshl_b32 s14, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v8
-; GFX11-NEXT: s_lshr_b32 s13, s13, 16
-; GFX11-NEXT: v_readfirstlane_b32 s14, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-NEXT: v_lshl_or_b32 v70, v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
-; GFX11-NEXT: s_bfe_u32 s12, s14, 0x10010
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-NEXT: s_add_i32 s12, s12, s14
-; GFX11-NEXT: s_bitset1_b32 s14, 22
-; GFX11-NEXT: s_addk_i32 s12, 0x7fff
-; GFX11-NEXT: s_and_b32 s15, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s12, s14, s12
-; GFX11-NEXT: s_and_b32 s14, s25, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s9
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshr_b32 s22, s12, 16
-; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: s_bfe_u32 s14, s9, 0x10010
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_add_i32 s14, s14, s9
-; GFX11-NEXT: s_bitset1_b32 s9, 22
-; GFX11-NEXT: s_addk_i32 s14, 0x7fff
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: s_and_b32 s12, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s9, s9, s14
-; GFX11-NEXT: s_lshl_b32 s12, s25, 16
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s12
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v3, v4
-; GFX11-NEXT: s_lshr_b32 s63, s9, 16
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_readfirstlane_b32 s8, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v8
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT: s_bfe_u32 s12, s8, 0x10010
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: s_add_i32 s12, s12, s8
-; GFX11-NEXT: s_bitset1_b32 s8, 22
-; GFX11-NEXT: s_addk_i32 s12, 0x7fff
-; GFX11-NEXT: s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_cselect_b32 s8, s8, s12
-; GFX11-NEXT: s_and_b32 s9, s24, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_lshr_b32 s25, s8, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v12, v9
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v9
-; GFX11-NEXT: s_pack_ll_b32_b16 s28, s0, s5
-; GFX11-NEXT: s_bfe_u32 s9, s7, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v3
-; GFX11-NEXT: s_add_i32 s9, s9, s7
-; GFX11-NEXT: s_bitset1_b32 s7, 22
-; GFX11-NEXT: s_addk_i32 s9, 0x7fff
-; GFX11-NEXT: s_and_b32 s8, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s7, s7, s9
-; GFX11-NEXT: s_lshl_b32 s8, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: s_lshr_b32 s12, s7, 16
-; GFX11-NEXT: v_readfirstlane_b32 s8, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v8
-; GFX11-NEXT: v_bfe_u32 v10, v12, 16, 1
-; GFX11-NEXT: s_bfe_u32 s4, s8, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2
-; GFX11-NEXT: s_add_i32 s4, s4, s8
-; GFX11-NEXT: s_bitset1_b32 s8, 22
-; GFX11-NEXT: s_addk_i32 s4, 0x7fff
-; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s8, s4
-; GFX11-NEXT: s_and_b32 s6, s27, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v52, 0x40c00000, s6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v12
-; GFX11-NEXT: s_lshr_b32 s24, s4, 16
-; GFX11-NEXT: v_readfirstlane_b32 s6, v52
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT: v_bfe_u32 v4, v9, 16, 1
-; GFX11-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_add_i32 s7, s7, s6
-; GFX11-NEXT: s_bitset1_b32 s6, 22
-; GFX11-NEXT: s_addk_i32 s7, 0x7fff
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s6, s7
-; GFX11-NEXT: s_lshl_b32 s6, s27, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v4, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_lshr_b32 s73, s4, 16
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v49
-; GFX11-NEXT: v_readfirstlane_b32 s6, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v51
-; GFX11-NEXT: v_lshl_or_b32 v66, v1, 16, v11
-; GFX11-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_add_i32 s7, s7, s6
-; GFX11-NEXT: s_bitset1_b32 s6, 22
-; GFX11-NEXT: s_addk_i32 s7, 0x7fff
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s6, s7
-; GFX11-NEXT: s_and_b32 s6, s26, 0xffff0000
-; GFX11-NEXT: s_lshr_b32 s27, s4, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s6
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v52
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v39
-; GFX11-NEXT: v_lshl_or_b32 v55, v50, 16, v4
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s22, s13
-; GFX11-NEXT: v_readfirstlane_b32 s6, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_lshl_or_b32 v54, v2, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v67, v48, 16, v5
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18]
-; GFX11-NEXT: s_bfe_u32 s5, s6, 0x10010
-; GFX11-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16]
-; GFX11-NEXT: s_add_i32 s5, s5, s6
-; GFX11-NEXT: s_bitset1_b32 s6, 22
-; GFX11-NEXT: s_addk_i32 s5, 0x7fff
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s14, s6, s5
-; GFX11-NEXT: s_lshl_b32 s4, s26, 16
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s20, s10
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
-; GFX11-NEXT: s_lshr_b32 s13, s14, 16
-; GFX11-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14]
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7]
-; GFX11-NEXT: s_pack_ll_b32_b16 s29, s1, s58
-; GFX11-NEXT: v_readfirstlane_b32 s11, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55]
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67]
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71]
-; GFX11-NEXT: s_bfe_u32 s10, s11, 0x10010
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20]
-; GFX11-NEXT: s_add_i32 s10, s10, s11
-; GFX11-NEXT: s_bitset1_b32 s11, 22
-; GFX11-NEXT: s_addk_i32 s10, 0x7fff
-; GFX11-NEXT: s_and_b32 s14, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s10, s11, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s19, s60
-; GFX11-NEXT: s_lshr_b32 s26, s10, 16
-; GFX11-NEXT: s_pack_ll_b32_b16 s4, s18, s40
-; GFX11-NEXT: s_pack_ll_b32_b16 s9, s23, s62
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v55
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 8, v55
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v54
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 24, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v66
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v66
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v71
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 8, v71
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v70
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 24, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 8, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 8, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v6
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s21, s61
-; GFX11-NEXT: s_pack_ll_b32_b16 s11, s25, s63
-; GFX11-NEXT: s_pack_ll_b32_b16 s57, s27, s73
-; GFX11-NEXT: s_pack_ll_b32_b16 s56, s26, s13
-; GFX11-NEXT: s_pack_ll_b32_b16 s10, s24, s12
-; GFX11-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], 24
-; GFX11-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
-; GFX11-NEXT: s_lshr_b64 s[40:41], s[44:45], 24
-; GFX11-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; GFX11-NEXT: s_lshr_b64 vcc, s[56:57], 24
-; GFX11-NEXT: s_lshr_b64 s[34:35], s[10:11], 24
-; GFX11-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
-; GFX11-NEXT: s_lshr_b32 s13, s57, 24
-; GFX11-NEXT: s_lshr_b32 s15, s57, 8
-; GFX11-NEXT: s_lshr_b32 s41, s56, 16
-; GFX11-NEXT: s_lshr_b32 s43, s56, 8
-; GFX11-NEXT: s_lshr_b32 s56, s11, 24
-; GFX11-NEXT: s_lshr_b32 s11, s11, 8
-; GFX11-NEXT: s_lshr_b32 s57, s10, 16
-; GFX11-NEXT: s_lshr_b32 s10, s10, 8
-; GFX11-NEXT: s_lshr_b32 s74, s9, 24
-; GFX11-NEXT: s_lshr_b32 s9, s9, 8
-; GFX11-NEXT: s_lshr_b32 s75, s8, 16
-; GFX11-NEXT: s_lshr_b32 s8, s8, 8
-; GFX11-NEXT: s_lshr_b32 s76, s7, 24
-; GFX11-NEXT: s_lshr_b32 s77, s7, 8
-; GFX11-NEXT: s_lshr_b32 s78, s6, 16
-; GFX11-NEXT: s_lshr_b32 s79, s6, 8
-; GFX11-NEXT: s_lshr_b32 s88, s5, 24
-; GFX11-NEXT: s_lshr_b32 s89, s5, 8
-; GFX11-NEXT: s_lshr_b32 s90, s4, 16
-; GFX11-NEXT: s_lshr_b32 s91, s4, 8
-; GFX11-NEXT: s_lshr_b32 s92, s47, 24
-; GFX11-NEXT: s_lshr_b32 s47, s47, 8
-; GFX11-NEXT: s_lshr_b32 s93, s46, 16
-; GFX11-NEXT: s_lshr_b32 s46, s46, 8
-; GFX11-NEXT: s_lshr_b32 s95, s45, 24
-; GFX11-NEXT: s_lshr_b32 s45, s45, 8
-; GFX11-NEXT: s_lshr_b32 s99, s44, 16
-; GFX11-NEXT: s_lshr_b32 s100, s44, 8
-; GFX11-NEXT: s_lshr_b32 s101, s29, 24
-; GFX11-NEXT: s_lshr_b32 s102, s29, 8
-; GFX11-NEXT: s_lshr_b32 s103, s28, 16
-; GFX11-NEXT: s_lshr_b32 s104, s28, 8
-; GFX11-NEXT: s_branch .LBB91_5
-; GFX11-NEXT: .LBB91_3:
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr74
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr104
-; GFX11-NEXT: ; implicit-def: $sgpr103
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr102
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr101
-; GFX11-NEXT: ; implicit-def: $sgpr100
-; GFX11-NEXT: ; implicit-def: $sgpr99
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr49
-; GFX11-NEXT: ; implicit-def: $sgpr37
-; GFX11-NEXT: ; implicit-def: $sgpr35
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr34
-; GFX11-NEXT: ; implicit-def: $sgpr52
-; GFX11-NEXT: ; implicit-def: $sgpr53
-; GFX11-NEXT: ; implicit-def: $sgpr50
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr36
-; GFX11-NEXT: ; implicit-def: $sgpr64
-; GFX11-NEXT: ; implicit-def: $sgpr38
-; GFX11-NEXT: ; implicit-def: $sgpr54
-; GFX11-NEXT: ; implicit-def: $sgpr96
-; GFX11-NEXT: ; implicit-def: $sgpr51
-; GFX11-NEXT: ; implicit-def: $sgpr67
-; GFX11-NEXT: ; implicit-def: $sgpr68
-; GFX11-NEXT: ; implicit-def: $sgpr65
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr55
-; GFX11-NEXT: ; implicit-def: $sgpr39
-; GFX11-NEXT: ; implicit-def: $sgpr71
-; GFX11-NEXT: ; implicit-def: $sgpr69
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr66
-; GFX11-NEXT: ; implicit-def: $sgpr82
-; GFX11-NEXT: ; implicit-def: $sgpr83
-; GFX11-NEXT: ; implicit-def: $sgpr80
-; GFX11-NEXT: ; implicit-def: $sgpr97
-; GFX11-NEXT: ; implicit-def: $sgpr70
-; GFX11-NEXT: ; implicit-def: $sgpr48
-; GFX11-NEXT: ; implicit-def: $sgpr84
-; GFX11-NEXT: ; implicit-def: $sgpr98
-; GFX11-NEXT: ; implicit-def: $sgpr81
-; GFX11-NEXT: ; implicit-def: $sgpr86
-; GFX11-NEXT: ; implicit-def: $sgpr87
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr85
-; GFX11-NEXT: ; implicit-def: $sgpr30
-; GFX11-NEXT: ; implicit-def: $sgpr94
-; GFX11-NEXT: ; implicit-def: $sgpr92
-; GFX11-NEXT: ; implicit-def: $sgpr90
-; GFX11-NEXT: ; implicit-def: $sgpr88
-; GFX11-NEXT: ; implicit-def: $sgpr78
-; GFX11-NEXT: ; implicit-def: $sgpr76
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s4, 0
-; GFX11-NEXT: v_writelane_b32 v43, s5, 1
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s4, 2
-; GFX11-NEXT: v_writelane_b32 v43, s5, 3
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s74, 4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s75, 5
-; GFX11-NEXT: ; implicit-def: $sgpr74
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: v_writelane_b32 v43, s74, 6
-; GFX11-NEXT: v_writelane_b32 v43, s75, 7
-; GFX11-NEXT: ; implicit-def: $sgpr74
-; GFX11-NEXT: s_branch .LBB91_2
-; GFX11-NEXT: .LBB91_4:
-; GFX11-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30
-; GFX11-NEXT: v_readlane_b32 s94, v43, 2
-; GFX11-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
-; GFX11-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35
-; GFX11-NEXT: v_readlane_b32 s95, v43, 3
-; GFX11-NEXT: v_readlane_b32 vcc_lo, v43, 6
-; GFX11-NEXT: v_readlane_b32 s30, v43, 0
-; GFX11-NEXT: v_readlane_b32 s34, v43, 4
-; GFX11-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45
-; GFX11-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46
-; GFX11-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98
-; GFX11-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97
-; GFX11-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58
-; GFX11-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9
-; GFX11-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61
-; GFX11-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62
-; GFX11-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96
-; GFX11-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7
-; GFX11-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28
-; GFX11-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6
-; GFX11-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86
-; GFX11-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5
-; GFX11-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48
-; GFX11-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84
-; GFX11-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82
-; GFX11-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80
-; GFX11-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39
-; GFX11-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69
-; GFX11-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67
-; GFX11-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65
-; GFX11-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64
-; GFX11-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54
-; GFX11-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52
-; GFX11-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50
-; GFX11-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
-; GFX11-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88
-; GFX11-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
-; GFX11-NEXT: s_mov_b32 s58, s11
-; GFX11-NEXT: v_readlane_b32 s59, v43, 8
-; GFX11-NEXT: v_readlane_b32 s72, v43, 9
-; GFX11-NEXT: v_readlane_b32 s60, v43, 10
-; GFX11-NEXT: v_readlane_b32 s61, v43, 11
-; GFX11-NEXT: v_readlane_b32 s62, v43, 12
-; GFX11-NEXT: v_readlane_b32 s63, v43, 13
-; GFX11-NEXT: v_readlane_b32 s73, v43, 14
-; GFX11-NEXT: v_readlane_b32 s13, v43, 15
-; GFX11-NEXT: v_readlane_b32 s15, v43, 16
-; GFX11-NEXT: v_readlane_b32 s41, v43, 17
-; GFX11-NEXT: v_readlane_b32 s43, v43, 18
-; GFX11-NEXT: v_readlane_b32 s56, v43, 19
-; GFX11-NEXT: v_readlane_b32 s11, v43, 20
-; GFX11-NEXT: v_readlane_b32 s57, v43, 21
-; GFX11-NEXT: v_readlane_b32 s10, v43, 22
-; GFX11-NEXT: v_readlane_b32 s74, v43, 23
-; GFX11-NEXT: v_readlane_b32 s9, v43, 24
-; GFX11-NEXT: v_readlane_b32 s75, v43, 25
-; GFX11-NEXT: v_readlane_b32 s8, v43, 26
-; GFX11-NEXT: v_readlane_b32 s76, v43, 27
-; GFX11-NEXT: v_readlane_b32 s77, v43, 28
-; GFX11-NEXT: v_readlane_b32 s78, v43, 29
-; GFX11-NEXT: v_readlane_b32 s79, v43, 30
-; GFX11-NEXT: v_readlane_b32 s88, v43, 31
-; GFX11-NEXT: v_readlane_b32 s89, v42, 0
-; GFX11-NEXT: v_readlane_b32 s90, v42, 1
-; GFX11-NEXT: v_readlane_b32 s91, v42, 2
-; GFX11-NEXT: v_readlane_b32 s92, v42, 3
-; GFX11-NEXT: v_readlane_b32 s47, v42, 4
-; GFX11-NEXT: v_readlane_b32 s93, v42, 5
-; GFX11-NEXT: v_readlane_b32 vcc_hi, v43, 7
-; GFX11-NEXT: v_readlane_b32 s46, v42, 6
-; GFX11-NEXT: v_readlane_b32 s31, v43, 1
-; GFX11-NEXT: v_readlane_b32 s95, v42, 7
-; GFX11-NEXT: v_readlane_b32 s45, v42, 8
-; GFX11-NEXT: v_readlane_b32 s35, v43, 5
-; GFX11-NEXT: .LBB91_5: ; %end
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s104, 8
-; GFX11-NEXT: s_and_b32 s5, s103, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s42, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s4
-; GFX11-NEXT: s_or_b32 s4, s5, s6
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s102, 8
-; GFX11-NEXT: s_and_b32 s6, s58, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s101, 8
-; GFX11-NEXT: s_or_b32 s1, s1, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s0, s0, s4
-; GFX11-NEXT: s_or_b32 s1, s1, s5
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s100, 8
-; GFX11-NEXT: s_and_b32 s5, s99, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s40, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s4
-; GFX11-NEXT: s_or_b32 s4, s5, s6
-; GFX11-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s45, 8
-; GFX11-NEXT: s_and_b32 s6, s59, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s95, 8
-; GFX11-NEXT: s_or_b32 s3, s3, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s4
-; GFX11-NEXT: s_or_b32 s3, s3, s5
-; GFX11-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
-; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
-; GFX11-NEXT: s_and_b32 s0, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s46, 8
-; GFX11-NEXT: s_and_b32 s2, s93, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s14, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s17, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s47, 8
-; GFX11-NEXT: s_and_b32 s4, s72, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s92, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s91, 8
-; GFX11-NEXT: s_and_b32 s4, s90, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s12, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s4, s19, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s89, 8
-; GFX11-NEXT: s_and_b32 s6, s60, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s88, 8
-; GFX11-NEXT: s_or_b32 s4, s4, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
-; GFX11-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
-; GFX11-NEXT: s_and_b32 s0, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s79, 8
-; GFX11-NEXT: s_and_b32 s2, s78, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s30, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s21, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s77, 8
-; GFX11-NEXT: s_and_b32 s4, s61, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s76, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s8, 8
-; GFX11-NEXT: s_and_b32 s4, s75, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s94, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s4, s23, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s9, 8
-; GFX11-NEXT: s_and_b32 s6, s62, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s74, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_or_b32 s4, s4, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_store_b128 v0, v[97:100], off
-; GFX11-NEXT: scratch_store_b128 v0, v[112:115], off offset:16
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
-; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
-; GFX11-NEXT: s_and_b32 s0, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s10, 8
-; GFX11-NEXT: s_and_b32 s2, s57, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s34, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s4
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_lshl_b32 s2, s11, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_and_b32 s1, s25, 0xff
-; GFX11-NEXT: s_and_b32 s3, s63, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s56, 8
-; GFX11-NEXT: s_or_b32 s1, s1, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: s_and_b32 s3, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s43, 8
-; GFX11-NEXT: s_or_b32 s1, s1, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: s_and_b32 s3, s41, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, vcc_lo, 8
-; GFX11-NEXT: s_lshl_b32 s5, s15, 8
-; GFX11-NEXT: s_or_b32 s3, s3, s4
-; GFX11-NEXT: s_and_b32 s4, s27, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s13, 8
-; GFX11-NEXT: s_or_b32 s4, s4, s5
-; GFX11-NEXT: s_and_b32 s5, s73, 0xff
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23
-; GFX11-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
-; GFX11-NEXT: v_or_b32_e32 v6, v23, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v11, v96, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GFX11-NEXT: v_or_b32_e32 v23, v6, v11
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v21
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v87
-; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v86
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-NEXT: v_or_b32_e32 v7, v11, v21
-; GFX11-NEXT: v_or_b32_e32 v11, v22, v13
-; GFX11-NEXT: v_or_b32_e32 v10, v26, v10
-; GFX11-NEXT: v_or_b32_e32 v13, v24, v14
-; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v85
-; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v29
-; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v84
-; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v28
-; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v83
-; GFX11-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-NEXT: v_or_b32_e32 v15, v22, v15
-; GFX11-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-NEXT: v_or_b32_e32 v21, v26, v27
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_or_b32_e32 v24, v6, v7
-; GFX11-NEXT: v_or_b32_e32 v25, v11, v10
-; GFX11-NEXT: v_or_b32_e32 v26, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v6, v15, v9
-; GFX11-NEXT: v_or_b32_e32 v7, v16, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v17
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v82
-; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v31
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v30
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v81
-; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v19
-; GFX11-NEXT: v_or_b32_e32 v9, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v8, v11, v8
-; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT: v_or_b32_e32 v13, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v80
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v34
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v20
-; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v33
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v71
-; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v70
-; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v69
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-NEXT: v_or_b32_e32 v4, v14, v4
-; GFX11-NEXT: v_or_b32_e32 v14, v15, v16
-; GFX11-NEXT: v_or_b32_e32 v15, v17, v18
-; GFX11-NEXT: v_or_b32_e32 v16, v19, v20
-; GFX11-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT: v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT: v_or_b32_e32 v13, v13, v4
-; GFX11-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT: v_or_b32_e32 v15, v16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v68
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v67
-; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v49
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v66
-; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v65
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v39
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v64
-; GFX11-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v4, v10, v11
-; GFX11-NEXT: v_or_b32_e32 v10, v16, v17
-; GFX11-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX11-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v48
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v55
-; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v52
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v54
-; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v53
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v51
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v50
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-NEXT: v_or_b32_e32 v17, v18, v19
-; GFX11-NEXT: v_or_b32_e32 v1, v20, v1
-; GFX11-NEXT: v_or_b32_e32 v12, v21, v12
-; GFX11-NEXT: v_or_b32_e32 v5, v22, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_or_b32_e32 v16, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v1, v10, v2
-; GFX11-NEXT: v_or_b32_e32 v2, v11, v18
-; GFX11-NEXT: v_or_b32_e32 v3, v17, v19
-; GFX11-NEXT: v_or_b32_e32 v4, v12, v5
-; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: scratch_store_b128 v0, v[97:100], off offset:32
-; GFX11-NEXT: scratch_store_b128 v0, v[112:115], off offset:48
-; GFX11-NEXT: scratch_store_b128 v0, v[23:26], off offset:64
-; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:80
-; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:96
-; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: v_readlane_b32 s104, v41, 8
-; GFX11-NEXT: v_readlane_b32 s103, v41, 7
-; GFX11-NEXT: v_readlane_b32 s102, v41, 6
-; GFX11-NEXT: v_readlane_b32 s101, v41, 5
-; GFX11-NEXT: v_readlane_b32 s100, v41, 4
-; GFX11-NEXT: v_readlane_b32 s99, v41, 3
-; GFX11-NEXT: v_readlane_b32 s98, v41, 2
-; GFX11-NEXT: v_readlane_b32 s97, v41, 1
-; GFX11-NEXT: v_readlane_b32 s96, v41, 0
-; GFX11-NEXT: v_readlane_b32 s87, v40, 31
-; GFX11-NEXT: v_readlane_b32 s86, v40, 30
-; GFX11-NEXT: v_readlane_b32 s85, v40, 29
-; GFX11-NEXT: v_readlane_b32 s84, v40, 28
-; GFX11-NEXT: v_readlane_b32 s83, v40, 27
-; GFX11-NEXT: v_readlane_b32 s82, v40, 26
-; GFX11-NEXT: v_readlane_b32 s81, v40, 25
-; GFX11-NEXT: v_readlane_b32 s80, v40, 24
-; GFX11-NEXT: v_readlane_b32 s71, v40, 23
-; GFX11-NEXT: v_readlane_b32 s70, v40, 22
-; GFX11-NEXT: v_readlane_b32 s69, v40, 21
-; GFX11-NEXT: v_readlane_b32 s68, v40, 20
-; GFX11-NEXT: v_readlane_b32 s67, v40, 19
-; GFX11-NEXT: v_readlane_b32 s66, v40, 18
-; GFX11-NEXT: v_readlane_b32 s65, v40, 17
-; GFX11-NEXT: v_readlane_b32 s64, v40, 16
-; GFX11-NEXT: v_readlane_b32 s55, v40, 15
-; GFX11-NEXT: v_readlane_b32 s54, v40, 14
-; GFX11-NEXT: v_readlane_b32 s53, v40, 13
-; GFX11-NEXT: v_readlane_b32 s52, v40, 12
-; GFX11-NEXT: v_readlane_b32 s51, v40, 11
-; GFX11-NEXT: v_readlane_b32 s50, v40, 10
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_b32 v40, off, s32
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:12
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1
+; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:12
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s96, 0
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s72, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s97, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v4
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s98, 2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v6
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s99, 3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s46, v9
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s47, v10
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s100, 4
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s101, 5
+; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s102, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s103, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s104, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 12
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 13
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 15
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 17
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 18
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 19
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s68, 20
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s69, 21
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s70, 22
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s71, 23
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s80, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s81, 25
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s82, 26
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s83, 27
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s84, 28
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s85, 29
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s86, 30
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s87, 31
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s1, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s0, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s43, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s43, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s43, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s42, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s42, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 18
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s45, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s45, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s45, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 19
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s44, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s47, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s47, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s47, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s46, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s46, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s59, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s59, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s59, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s58, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s58, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s61, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s61, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s61, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s60, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s60, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s63, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s63, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 12
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s63, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s62, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s62, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s73, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s73, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s73, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 25
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s72, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s72, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s29, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 26
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s29, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s28, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s28, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 11
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[42:43], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[44:45], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[46:47], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s44, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 7
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 5
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 3
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 1
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB91_4
+; GFX11-TRUE16-NEXT: .LBB91_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s29, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s29, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s78, s28, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s15, s45, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s28, s45, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s43, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s43, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s73, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s77, s73, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s76, s72, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s75, s72, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s63, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s74, s63, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s73, s62, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s72, s62, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s62, s61, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s63, s61, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s61, s60, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s60, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s40, s59, 0xffff0000
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s59, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s29, s58, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s58, 16
+; GFX11-TRUE16-NEXT: s_bfe_u32 s4, s45, 0x10010
+; GFX11-TRUE16-NEXT: s_and_b32 s12, s47, 0xffff0000
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s4, s45
+; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s47, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s47, s46, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s41, s46, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s44, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s44, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s42, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s42, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s45, 22
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s45, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s78
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s42, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v6
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s1, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.l
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s1
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s77
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v25.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s42, s6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v8, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s0, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s0
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s42
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s76
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s75
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s42, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s3, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s3
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s74
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v26.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v28.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11
+; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s11, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s42, s11
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s2, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s2
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s42
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s73
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v30.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s72
+; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s42, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v29.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s17, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v31.l
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s17
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s17, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s17, s17, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s63
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s17, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[4:5]
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s62
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s42, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s16, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s16, 0x10010
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s16
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s16, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s16, s16, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s16, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s61
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s57
+; GFX11-TRUE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s42, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s19, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.l
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s19
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s19, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s19, s19, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s56
+; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s19, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v34.l
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s17, s60
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s16, s44
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: s_bfe_u32 s40, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s40, s40, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s40, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s42, s40
+; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s40, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s18, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s18, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s18
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s18, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s18, s18, s42
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s29
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s18, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s29, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s29, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s29
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s29, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s29, s29, s42
+; GFX11-TRUE16-NEXT: s_lshl_b32 s21, s21, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s29, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s2, s11
+; GFX11-TRUE16-NEXT: s_bfe_u32 s21, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.l
+; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s21, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s21
+; GFX11-TRUE16-NEXT: s_and_b32 s21, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s13
+; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v36.l
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s3, s59
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s12
+; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s13, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, s13
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s13, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s13, s12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_bfe_u32 s14, s13, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, s13
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s13, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s14, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s20, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s13, s14
+; GFX11-TRUE16-NEXT: s_and_b32 s14, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s47
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s20, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s29, s20, s14
+; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s13, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s29
+; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s23, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s41
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s13, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s23, s14, 0x10010
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s0, s6
+; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s23, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s23
+; GFX11-TRUE16-NEXT: s_and_b32 s14, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s13, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v38.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s15
+; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v48.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s22, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s15
+; GFX11-TRUE16-NEXT: s_and_b32 s15, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v1.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s10, 0x10010
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s10, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s9, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s8, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v51.l
+; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s8
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v50.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v49.l
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s9, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s9
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s9, s7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s24, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s7, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v53.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s8, 0x10010
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s8, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s5, 16
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s4, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s4
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s4, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v54.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s5, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v52.l
+; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s5, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s4, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v55.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[22:23]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[20:21]
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[16:17]
+; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63
+; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s5, s6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s26, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s14, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s61
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v65
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[64:65]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[68:69]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v65
+; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s11, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v64
+; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, s11
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s11, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s11, s12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v64
+; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s12, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v68
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v68
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s62
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s72
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s27, s73
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s26, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[44:45], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[42:43], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[28:29], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 vcc, s[46:47], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s47, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s47, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s46, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s46, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s10, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s10, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s9, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s8, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s7, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s7, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s6, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s6, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s5, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s5, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s4, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s45, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s45, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s44, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s44, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s43, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s43, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s42, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s42, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s29, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-TRUE16-NEXT: s_branch .LBB91_5
+; GFX11-TRUE16-NEXT: .LBB91_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 1
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s74, 4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s75, 5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s74, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s75, 7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-TRUE16-NEXT: s_branch .LBB91_2
+; GFX11-TRUE16-NEXT: .LBB91_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s94 :: v_dual_mov_b32 v14, s30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v43, 2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s49 :: v_dual_mov_b32 v5, s35
+; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_lo, v43, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v43, 4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, s42 :: v_dual_mov_b32 v54, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v49, s98
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, s46 :: v_dual_mov_b32 v38, s47
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s97 :: v_dual_mov_b32 v39, s58
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s59 :: v_dual_mov_b32 v36, s60
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s9 :: v_dual_mov_b32 v32, s61
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v33, s62
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s63 :: v_dual_mov_b32 v30, s72
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s96 :: v_dual_mov_b32 v26, s73
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s7 :: v_dual_mov_b32 v27, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s29 :: v_dual_mov_b32 v25, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s87 :: v_dual_mov_b32 v64, s86
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s85 :: v_dual_mov_b32 v10, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, s4 :: v_dual_mov_b32 v68, s48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, s81 :: v_dual_mov_b32 v66, s84
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s83 :: v_dual_mov_b32 v69, s70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s82 :: v_dual_mov_b32 v23, s80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v71, s66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s39 :: v_dual_mov_b32 v21, s69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v81, s55
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s67 :: v_dual_mov_b32 v19, s65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v83, s51
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v17, s54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v11, s52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v12, s50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s78 :: v_dual_mov_b32 v7, s88
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s11
+; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s60, v43, 9
+; GFX11-TRUE16-NEXT: v_readlane_b32 s61, v43, 10
+; GFX11-TRUE16-NEXT: v_readlane_b32 s62, v43, 11
+; GFX11-TRUE16-NEXT: v_readlane_b32 s63, v43, 12
+; GFX11-TRUE16-NEXT: v_readlane_b32 s72, v43, 13
+; GFX11-TRUE16-NEXT: v_readlane_b32 s73, v43, 14
+; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v43, 15
+; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v43, 16
+; GFX11-TRUE16-NEXT: v_readlane_b32 s41, v43, 17
+; GFX11-TRUE16-NEXT: v_readlane_b32 s46, v43, 18
+; GFX11-TRUE16-NEXT: v_readlane_b32 s47, v43, 19
+; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v43, 20
+; GFX11-TRUE16-NEXT: v_readlane_b32 s57, v43, 21
+; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v43, 22
+; GFX11-TRUE16-NEXT: v_readlane_b32 s74, v43, 23
+; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v43, 24
+; GFX11-TRUE16-NEXT: v_readlane_b32 s75, v43, 25
+; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v43, 26
+; GFX11-TRUE16-NEXT: v_readlane_b32 s76, v43, 27
+; GFX11-TRUE16-NEXT: v_readlane_b32 s77, v43, 28
+; GFX11-TRUE16-NEXT: v_readlane_b32 s78, v43, 29
+; GFX11-TRUE16-NEXT: v_readlane_b32 s79, v43, 30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s88, v43, 31
+; GFX11-TRUE16-NEXT: v_readlane_b32 s89, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s90, v42, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v42, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s92, v42, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v42, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_hi, v43, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v42, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v42, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v42, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v43, 5
+; GFX11-TRUE16-NEXT: .LBB91_5: ; %end
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s104, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s103, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s56, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s102, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s58, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s101, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s100, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s99, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s40, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s43, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s59, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s95, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s44, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s93, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s14, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s17, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s45, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s60, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s92, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s91, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s90, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s12, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s89, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s61, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s88, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s79, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s78, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s30, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s21, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s77, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s62, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s76, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s8, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s75, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s94, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s9, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s63, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s74, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[97:100], off
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s10, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s57, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s34, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s11, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s72, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s47, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s46, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s41, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, vcc_lo, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s15, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s13, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s73, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v27, 0xff, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v4, 8, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v5, 8, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v4, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v26, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v25, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v30, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v26, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v28, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v29, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v11, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v12, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v16, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v18, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v17, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v66
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v21, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v8, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v10, v3
+; GFX11-TRUE16-NEXT: s_clause 0x5
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[4:7], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:112
+; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v41, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v41, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v41, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v41, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v41, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v41, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v41, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v41, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v41, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v40, 31
+; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v40, 30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v40, 29
+; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v40, 28
+; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v40, 27
+; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v40, 26
+; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v40, 25
+; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v40, 24
+; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v40, 23
+; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v40, 22
+; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v40, 21
+; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v40, 20
+; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v40, 19
+; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v40, 18
+; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v40, 17
+; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v40, 16
+; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v40, 15
+; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v40, 14
+; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v40, 13
+; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v40, 12
+; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v40, 11
+; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10
+; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9
+; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:12
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:12
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s96, 0
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s72, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s97, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v3
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s98, 2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v7
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s99, 3
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v10
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s100, 4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s101, 5
+; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s102, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s103, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s104, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 12
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 13
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 15
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 17
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 18
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 19
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s68, 20
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s69, 21
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s70, 22
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s71, 23
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s80, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s81, 25
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s82, 26
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s83, 27
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s84, 28
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s85, 29
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s86, 30
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s87, 31
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 15
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s1, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s0, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s45, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s45, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 17
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s45, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s44, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s44, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s47, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s47, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s47, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 19
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s46, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s57, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s57, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 13
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s57, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s56, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s56, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s59, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s59, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s59, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 21
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s58, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s58, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s61, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 22
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s61, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s61, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s60, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 23
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s60, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s63, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s63, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s63, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s62, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s62, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s73, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s73, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s73, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 25
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s72, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s72, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s29, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 26
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s28, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 27
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s28, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[44:45], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[46:47], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 28
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[56:57], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 29
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 30
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 31
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s46, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 7
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 5
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 1
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB91_4
+; GFX11-FAKE16-NEXT: .LBB91_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s29, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s47, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s47, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s45, 0xffff0000
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s45, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s78, s28, 0xffff0000
+; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s47, 0x10010
+; GFX11-FAKE16-NEXT: s_lshl_b32 s79, s28, 16
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s6, s47
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s77, s73, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s75, s72, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s76, s72, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s63, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s74, s63, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s72, s62, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s73, s62, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s63, s61, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s62, s61, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s61, s60, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s60, s60, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s41, s59, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s40, s59, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s28, s58, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s29, s58, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s13, s57, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s57, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s42, s56, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s56, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s12, s46, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s46, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s44, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s44, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s47, 22
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s47, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s44, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s78
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s79
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s1, 0x10010
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s1
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s77
+; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s5, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s44, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s76
+; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s44, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s75
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 24, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s3, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s3
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v24
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s74
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v5
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s2, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s11, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s44, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s73
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s72
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s44, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s17, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v27
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s17
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s17, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v28, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-FAKE16-NEXT: s_cselect_b32 s17, s17, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s17, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s63
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 24, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v5, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v29
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s44, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s16, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s62
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s16, 0x10010
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s16
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s16, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s16, s16, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s61
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s19
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s44, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v9, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s19, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s19
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s19, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s19, s19, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s19, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s41
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s47, s17, s72
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s44, s41, 0x10010
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s44, s44, s41
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s41, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40
+; GFX11-FAKE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s41, s41, s44
+; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s18, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s40, s18, 0x10010
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s28
+; GFX11-FAKE16-NEXT: s_add_i32 s44, s40, s18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s41, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s18, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s18, s18, s44
+; GFX11-FAKE16-NEXT: s_and_b32 s41, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s41
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s18, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v10, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s28, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s28
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s28, s28, s29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s21, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s21
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s2, s11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s21, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s21
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s21, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s21, s21, s29
+; GFX11-FAKE16-NEXT: s_and_b32 s28, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s21, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s45, s3, s59
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s46, s16, s46
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s13, 0x10010
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s13
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s13, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s13, s28
+; GFX11-FAKE16-NEXT: s_lshl_b32 s20, s20, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v35
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s20, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v2, 16, v9
+; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s20, 0x10010
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s10, s20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s13, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s20, 22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s20, s28
+; GFX11-FAKE16-NEXT: s_and_b32 s20, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s42
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s20
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s43
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_bfe_u32 s20, s28, 0x10010
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_add_i32 s29, s20, s28
+; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s13, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s28, s29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s23, s23, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s13, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s23, 0x10010
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s23
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s23, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s23, s28
+; GFX11-FAKE16-NEXT: s_and_b32 s23, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s23
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s13, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v71, v37, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s12
+; GFX11-FAKE16-NEXT: s_bfe_u32 s15, s14, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, s14
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s15, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s14, s15
+; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
+; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s14, 0x10010
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s14
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s14, s12
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: s_bfe_u32 s14, s9, 0x10010
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, s9
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s14, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s14
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s25, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s9, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s8, 0x10010
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s8
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s12
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s8, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v12, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5
+; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s7, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v3
+; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s7
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s4, s8, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s8
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s4, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v4, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s4, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v49
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v51
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v66, v1, 16, v11
+; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s4, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v52
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v55, v50, 16, v4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v54, v2, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v48, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18]
+; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s6, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16]
+; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s14, s6, s5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s26, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s20, s10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s14, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7]
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71]
+; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s11, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20]
+; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s11
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s19, s60
+; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s10, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s23, s62
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 24, v71
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s21, s61
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s25, s63
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s57, s27, s73
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s56, s26, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s24, s12
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[44:45], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 vcc, s[56:57], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s57, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s57, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s56, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s56, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s11, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s11, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s10, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s10, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s9, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s8, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s7, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s6, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s5, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s47, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s47, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s46, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s46, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s45, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s44, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s44, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s29, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-FAKE16-NEXT: s_branch .LBB91_5
+; GFX11-FAKE16-NEXT: .LBB91_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 1
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 5
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: s_branch .LBB91_2
+; GFX11-FAKE16-NEXT: .LBB91_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v43, 2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35
+; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_lo, v43, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v43, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v43, 4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
+; GFX11-FAKE16-NEXT: s_mov_b32 s58, s11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s59, v43, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s72, v43, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s60, v43, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s61, v43, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s62, v43, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s63, v43, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s73, v43, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v43, 15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v43, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s41, v43, 17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v43, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s56, v43, 19
+; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v43, 20
+; GFX11-FAKE16-NEXT: v_readlane_b32 s57, v43, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v43, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s74, v43, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v43, 24
+; GFX11-FAKE16-NEXT: v_readlane_b32 s75, v43, 25
+; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v43, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s76, v43, 27
+; GFX11-FAKE16-NEXT: v_readlane_b32 s77, v43, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s78, v43, 29
+; GFX11-FAKE16-NEXT: v_readlane_b32 s79, v43, 30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s88, v43, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s89, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s90, v42, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v42, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s92, v42, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v42, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s46, v42, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v43, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v43, 5
+; GFX11-FAKE16-NEXT: .LBB91_5: ; %end
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s104, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s103, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s42, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s102, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s58, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s100, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s99, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s40, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s45, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s59, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s95, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s46, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s93, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s72, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s92, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s91, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s90, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s12, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s89, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s60, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s79, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s78, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s30, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s77, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s61, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s76, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s75, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s94, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s9, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s62, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s74, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s10, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s34, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s11, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s63, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s56, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s43, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s41, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_lo, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s15, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s13, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v96, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v6, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v87
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v11, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v22, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v26, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v24, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v83
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v22, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v26, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v6, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v11, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v13, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v15, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v14, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v17, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v49
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v10, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v16, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v18, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v19, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v55
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v52
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v53
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v18, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v20, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v21, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v10, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v11, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v5
+; GFX11-FAKE16-NEXT: s_clause 0x5
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
+; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v41, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v41, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v41, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v41, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v41, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v41, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v41, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v41, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v41, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v40, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v40, 30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v40, 29
+; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v40, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v40, 27
+; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v40, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v40, 25
+; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v40, 24
+; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v40, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v40, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v40, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v40, 20
+; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v40, 19
+; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v40, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v40, 17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v40, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v40, 15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v40, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v40, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v40, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v40, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v40, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:12
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -180286,9 +185691,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
@@ -180304,6 +185710,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -180314,201 +185721,169 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB93_3
; GFX11-TRUE16-NEXT: .LBB93_2: ; %cmp.true
@@ -180548,57 +185923,59 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
@@ -180607,7 +185984,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -180618,18 +185995,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v150, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -180637,13 +186014,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
@@ -180652,29 +186029,29 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -180688,8 +186065,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
@@ -180697,167 +186074,141 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v103, 0x300, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v128, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v113, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v102, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v96
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v97, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v100
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v87, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v86, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v85, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v84, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v83, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v82, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v81, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v71, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v70, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v69, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v112, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v68, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v66, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v65, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v34
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v36.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.h, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.h, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.h, v144.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.h, v145.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.h, v131.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.h, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.h, v150.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.h, v160.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.h, v179.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.h, v181.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB93_3: ; %end
; GFX11-TRUE16-NEXT: s_clause 0x1e
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
@@ -202763,9 +208114,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
@@ -202781,6 +208133,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -202791,201 +208144,169 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_3
; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true
@@ -203025,57 +208346,59 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
@@ -203084,7 +208407,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -203095,18 +208418,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v150, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -203114,13 +208437,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
@@ -203129,29 +208452,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -203165,8 +208488,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
@@ -203174,167 +208497,141 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v103, 0x300, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v128, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v113, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v102, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v96
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v97, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v100
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v87, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v86, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v85, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v84, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v83, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v82, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v81, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v71, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v70, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v69, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v112, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v68, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v66, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v65, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v34
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v36.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.h, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.h, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.h, v144.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.h, v145.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.h, v131.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.h, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.h, v150.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.h, v160.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.h, v179.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.h, v181.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB97_3: ; %end
; GFX11-TRUE16-NEXT: s_clause 0x1e
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
@@ -221009,700 +226306,1362 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v64bf16_to_v64f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
-; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
-; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
-; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
-; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB101_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB101_4
-; GFX11-NEXT: .LBB101_2: ; %cmp.true
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v16
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v11, v3
-; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v19
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v8, v3
-; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v0
-; GFX11-NEXT: v_bfe_u32 v0, v6, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v23
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5
-; GFX11-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v19, v19, 16, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6
-; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v6, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v6
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_and_b32_e32 v49, 0xffff, v49
-; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v29
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v2, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v0
-; GFX11-NEXT: v_bfe_u32 v0, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v4
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v6
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v7
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v5, v9, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v9
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v10, v9, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v6
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v9
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v67, 0xffff, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v11, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v10, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_bfe_u32 v13, v10, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v7
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v8
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v13
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v15, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v10
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v11
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v9
-; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v11
-; GFX11-NEXT: v_bfe_u32 v82, v14, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v82, v82, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_bfe_u32 v83, v12, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v5, v68, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v82, v83, v12
-; GFX11-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-NEXT: v_bfe_u32 v82, v83, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v13
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v83
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v82, v83
-; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13
-; GFX11-NEXT: v_bfe_u32 v86, v82, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v86, v86, v82
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v14
-; GFX11-NEXT: v_bfe_u32 v14, v83, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v83
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86
-; GFX11-NEXT: v_add_f32_e64 v86, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v82
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v86
-; GFX11-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v97, v82, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v85, v86, 16, 1
-; GFX11-NEXT: v_bfe_u32 v99, v96, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v97, v97, v82
-; GFX11-NEXT: v_or_b32_e32 v103, 0x400000, v82
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT: v_add_nc_u32_e32 v99, v99, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v85, v85, v86
-; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
-; GFX11-NEXT: v_bfe_u32 v101, v98, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v101, v101, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v14, v83
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
-; GFX11-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101
-; GFX11-NEXT: v_or_b32_e32 v101, 0x400000, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v96
-; GFX11-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-NEXT: v_and_b32_e32 v82, 0xffff, v82
-; GFX11-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v65, 16, v67
-; GFX11-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v85
-; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v80, 0xffff, v80
-; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v86
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v86, 0xffff, v96
-; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v68
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v83, 16, v82
-; GFX11-NEXT: v_lshl_or_b32 v15, v85, 16, v86
-; GFX11-NEXT: v_and_b32_e32 v83, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v86, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v82, 0xffff, v96
-; GFX11-NEXT: v_and_b32_e32 v96, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v85, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v87, 16, v83
-; GFX11-NEXT: v_lshl_or_b32 v10, v9, 16, v86
-; GFX11-NEXT: v_lshl_or_b32 v13, v97, 16, v82
-; GFX11-NEXT: v_and_b32_e32 v82, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v9, v81, 16, v96
-; GFX11-NEXT: v_and_b32_e32 v81, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v83, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v11, v84, 16, v85
-; GFX11-NEXT: v_lshl_or_b32 v6, v69, 16, v82
-; GFX11-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v64, 16, v66
-; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v29
-; GFX11-NEXT: v_lshl_or_b32 v7, v70, 16, v81
-; GFX11-NEXT: v_and_b32_e32 v70, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v55, 16, v69
-; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v64
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff, v24
-; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v27, v50, 16, v65
-; GFX11-NEXT: v_lshl_or_b32 v29, v52, 16, v55
-; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v24, v38, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v51
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v8, v71, 16, v80
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v83
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v70
-; GFX11-NEXT: v_lshl_or_b32 v30, v53, 16, v54
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v66
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v49
-; GFX11-NEXT: v_lshl_or_b32 v23, v23, 16, v50
-; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v52
-; GFX11-NEXT: v_lshl_or_b32 v20, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v37
-; GFX11-NEXT: v_lshl_or_b32 v17, v17, 16, v38
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB101_3:
-; GFX11-NEXT: s_branch .LBB101_2
-; GFX11-NEXT: .LBB101_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB101_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB101_4
+; GFX11-TRUE16-NEXT: .LBB101_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v85, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v97, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v85
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v97, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, v98, v97
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, 0x7fff, v98
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v11, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v33.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v35.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v37.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_f32 v2, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v38.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v39.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v48.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v50.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v49.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v51.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v0, v0, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v52.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v53.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v5 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v54.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v55.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v0, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v7 :: v_dual_add_nc_u32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v6 :: v_dual_add_nc_u32 v4, v7, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v8 :: v_dual_add_nc_u32 v6, v9, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v10 :: v_dual_add_nc_u32 v8, v11, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v12 :: v_dual_add_nc_u32 v10, v13, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v82, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v11, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v10, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v82
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v83.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v12, v14 :: v_dual_add_nc_u32 v13, v15, v84
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v82, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v85, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v86.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v84, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, v96, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v82, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v84
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v82
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, v87, v82
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v85.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, 0x7fff, v84
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v100, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v87
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v84, v99, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v97
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, v100, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v98, v99, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, 0x7fff, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v96.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v97
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v100, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v87.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB101_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB101_2
+; GFX11-TRUE16-NEXT: .LBB101_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB101_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB101_4
+; GFX11-FAKE16-NEXT: .LBB101_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v11, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v8, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v82, v82, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v12, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v68, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v82, v83, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v83
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v82, v83
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v82, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, v86, v82
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v83, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v83
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v86, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v82
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v86
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v97, v82, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v96, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, v97, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v103, 0x400000, v82
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, v99, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, v85, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
+; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v98, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, v101, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v83
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v65, 16, v67
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v85
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v80
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v96
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v1, 16, v68
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v83, 16, v82
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v85, 16, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v83, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v87, 16, v83
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v9, 16, v86
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v97, 16, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v81, 16, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v83, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v84, 16, v85
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v69, 16, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v64, 16, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v70, 16, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v55, 16, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v50, 16, v65
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v52, 16, v55
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v38, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v51
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v71, 16, v80
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v83
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v70
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v54
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v66
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v49
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v52
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v17, 16, v38
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB101_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB101_2
+; GFX11-FAKE16-NEXT: .LBB101_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -227079,568 +233038,496 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v16 :: v_dual_lshlrev_b32 v35, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v32, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v52, 16, v24
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v32
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v16, v16, v38
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17
-; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v34, 16, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v82, 16, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v37, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_lshlrev_b32 v83, 16, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v33, v38, v48
+; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v34, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v17, v35, v37 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v38, v34, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v18, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v37, v32 :: v_dual_add_f32 v37, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_add3_u32 v32, v34, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v38, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v19, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v19
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v48
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v35, v38, v39 :: v_dual_lshlrev_b32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v39 :: v_dual_lshlrev_b32 v39, 16, v20
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_lshlrev_b32 v87, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v38, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v38
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v96, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v37, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v39, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v20, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v39
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_add_f32 v49, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v35 :: v_dual_lshlrev_b32 v21, 16, v22
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v39, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v87
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v38, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v38, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v35.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v32.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v48, v49, v38, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v48
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v49, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v37, v48, v49 :: v_dual_lshlrev_b32 v48, 16, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_cndmask_b32 v21, v39, v49
+; GFX11-TRUE16-NEXT: v_add3_u32 v39, v50, v38, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v22, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v37.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v39, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v39, v50, v22, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT: v_add3_u32 v50, v51, v48, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v48
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v39, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v23, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v49, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v49
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v50, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v22, v37, v38 :: v_dual_lshlrev_b32 v71, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v23
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v50, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v50
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_lshlrev_b32 v51, 16, v24
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v48
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v52
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v49, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v39, v50, v51 :: v_dual_lshlrev_b32 v50, 16, v25
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v48, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v50 :: v_dual_cndmask_b32 v23, v49, v51
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v52, v48, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v48
+; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v24, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v49, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v22.h
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v49, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v49
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v50, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v52 :: v_dual_lshlrev_b32 v52, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v21
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v38, v39 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v50, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v50
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v51, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v50, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v49, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v52, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v50, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v50
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v49, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v25, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v54
+; GFX11-TRUE16-NEXT: v_add3_u32 v51, v51, v25, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v49, v52, v53 :: v_dual_lshlrev_b32 v52, 16, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v50, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX11-TRUE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v51
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v39, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v39, v50, v49, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v49
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v52, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v52
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v48, v50, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v38.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v51, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v51, v54, v50, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v50
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v49.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v51, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v51, v54, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v51, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v27, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v26
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v51, 16, 1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v53, 0x40c00000, v53 :: v_dual_add_f32 v54, 0x40c00000, v26
-; GFX11-TRUE16-NEXT: v_add3_u32 v49, v50, v51, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v54, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_cndmask_b32 v26, v50, v51
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v54, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v54
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v55, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v28
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v50, v51, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v55, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v55
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v28, v50, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v53
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v54, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v55, 16, 1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v26
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v37.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v51, v52, v54, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v54
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v64
+; GFX11-TRUE16-NEXT: v_add3_u32 v53, v53, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v51, v54, v55 :: v_dual_lshlrev_b32 v54, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v27
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v51, v52, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v51, v53, v55, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v55
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v65 :: v_dual_lshlrev_b32 v65, 16, v30
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v50.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v65
-; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v64, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v64
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v66
+; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v48.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v53, v55, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v53, v64, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v52
+; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v54, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v51.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v53, v55, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v53, v64, v28, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v54, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v55, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v29, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v66
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v55, v29, 0x7fff
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v30, v52, v53 :: v_dual_and_b32 v31, 0xffff0000, v31
-; GFX11-TRUE16-NEXT: v_add3_u32 v52, v55, v54, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v54
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v53, v64, v65 :: v_dual_lshlrev_b32 v64, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v54, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v64, 0x40c00000, v64 :: v_dual_cndmask_b32 v29, v55, v65
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v54, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v54
+; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v30, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v65, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v65
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v66 :: v_dual_add_f32 v31, 0x40c00000, v31
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v53, v55, v65, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v64, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v64, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v64
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v66
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v64, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v53.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v55, v65, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v30, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v30
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v55, v65, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v31, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v68
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v55, v64, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v31, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v31
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v67, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v68
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v65, v31, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v66, v67 :: v_dual_lshlrev_b32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v64, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v67, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v67
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v69
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v55, v64, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v66
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v66 :: v_dual_cndmask_b32 v31, v65, v67
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v66, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v55.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v65, v67, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v68, v69, v66, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v66
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v65, v67, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v67, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v69
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v70
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v65, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v68, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v67, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v68
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v55.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v68, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v68, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v68
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v64, v65 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v64, v66, v67, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v67
-; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v69, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v50.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v65, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v70, v67, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v67
+; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v68, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v65, v66, v69, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v69
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
-; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v70, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v68, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v68
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v70
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v68, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v67, v70, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v65, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v68, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v68
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v67, v69, v70, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
-; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v71, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v68, v69, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v67, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v80
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v65.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v67, v67, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v70, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v69, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v68, v69, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v71
-; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
-; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v80, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v70, 0x40c00000, v70
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v52.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v67, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v67, v80, v69, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v69
+; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v70, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v68.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v69, v80, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v69, v70, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v80
-; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
-; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v81, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v70, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v70
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v69, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v67.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v69, v69, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v80, v81, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v71, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v70, v71, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v81
-; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v82, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v71, v80, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v82
-; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_cndmask_b32 v5, v69, v81
+; GFX11-TRUE16-NEXT: v_add3_u32 v69, v82, v71, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v71
+; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v80, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v70.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v71, v82, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v81, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v80, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v80
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v71, v81, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v81, 0x40c00000, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v69.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v71, v71, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v82, v83, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v81, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v83
-; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_cndmask_b32 v7, v71, v83
+; GFX11-TRUE16-NEXT: v_add3_u32 v71, v84, v81, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v82, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v80.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v81, v84, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v83, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v81, v82, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v84
-; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v85, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v84, v85, v82, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v82
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v81, v83, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v83, 0x40c00000, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v71.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v81, v81, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v84, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v83, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v82, v83, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v85
-; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v10, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v81
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v54.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v81, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v81, v86, v83, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v82.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v83, v86, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v7, 16, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v83, v84, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v86
-; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v11, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v85, v85, v11, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v83, v84, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v86, v87, v84, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v84
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v83, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v85, 0x40c00000, v96
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v81.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v83, v83, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v85, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v86, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v82
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v84, v84, v87, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-TRUE16-NEXT: v_add3_u32 v87, v99, v86, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v86
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v6, 16, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v97, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v83
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v87, v96, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v83, v86, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v83, 0x40c00000, v87
+; GFX11-TRUE16-NEXT: v_add3_u32 v86, v96, v85, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v85
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-TRUE16-NEXT: v_bfe_u32 v97, v83, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v84.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v86, v96, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add3_u32 v96, v97, v83, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v97
+; GFX11-TRUE16-NEXT: v_add3_u32 v87, v99, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v85.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v96, v98, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v97, v86, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 16, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v70
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_add_f32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v5, 16, v6
-; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v96, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v102, v15, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v113, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v96
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v17, 16, v69
-; GFX11-TRUE16-NEXT: v_add3_u32 v101, v101, v96, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v102, v102, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v66.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v27, 16, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v51.h
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v66
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v27, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v38.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v85, v98, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v66.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v87, v96, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v87, 0x40c00000, v98
-; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v34.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v96, v97, v86, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v86
+; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v87, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v98, v98, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v87
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT: v_bfe_u32 v100, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v83.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v96, v97, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v97, v98, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v98, v99, v87, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v98, v99, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v100, v100, v14, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v100, v112, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v96, v96, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v97, v100, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v14.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v102, v113, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v101, v114, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v96
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v99, v103, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v13
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v3, 16, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v87
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v98, v99, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v4, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v85, v97, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v86
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v12.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v84
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v3, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v80
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v4, 16, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v4, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v68
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v64.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v17, 16, v65
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v54.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v67
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v22, 16, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v52.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v17, 16, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v49.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v22, 16, v52
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v17, 16, v48
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v36.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v22, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v17, 16, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v37, 16, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v86.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v96, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v87.h
; GFX11-TRUE16-NEXT: .LBB104_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -230606,641 +236493,1242 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v64bf16_to_v64i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
-; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
-; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
-; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
-; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB105_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB105_4
-; GFX11-NEXT: .LBB105_2: ; %cmp.true
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v16
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v17
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v0
-; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v11, v3
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v6, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v7, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v6 :: v_dual_add_nc_u32 v2, v2, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v1, v5 :: v_dual_lshlrev_b32 v5, 16, v19
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v2, v6 :: v_dual_add_nc_u32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v19, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v20
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v0, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v22
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v37, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v23, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v38, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v25
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v25
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v24, v1, v3
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v7 :: v_dual_add_f32 v3, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v39, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v25, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v27
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v49, v1, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v28
-; GFX11-NEXT: v_dual_cndmask_b32 v26, v0, v6 :: v_dual_add_nc_u32 v1, v1, v4
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v29
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v28, v1, v3 :: v_dual_lshlrev_b32 v5, 16, v29
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v51, v0, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v29, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v52, v0, v1 :: v_dual_add_nc_u32 v1, 0x7fff, v4
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v31
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v30, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v31
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v53, v0, v5 :: v_dual_add_f32 v0, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v54, v1, v2 :: v_dual_add_nc_u32 v1, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v1, v2, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v1
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v55, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v6
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v64, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v10
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v65, v6, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65
-; GFX11-NEXT: v_cndmask_b32_e32 v66, v4, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v67, v6, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4
-; GFX11-NEXT: v_bfe_u32 v8, v10, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67
-; GFX11-NEXT: v_cndmask_b32_e32 v68, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v10
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v5
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v69, v6, v7, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6
-; GFX11-NEXT: v_bfe_u32 v10, v12, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v70, v7, v8, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v12
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v7
-; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v69, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v11, v7
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13
-; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v80, v8, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v13
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11
-; GFX11-NEXT: v_bfe_u32 v71, v12, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v10 :: v_dual_add_nc_u32 v9, 0x7fff, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v10, v14, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v81, v9, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v14
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v14
-; GFX11-NEXT: v_bfe_u32 v83, v13, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v71, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v83, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v82, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v83, v15, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v71, v12, v71, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v83, v15
-; GFX11-NEXT: v_bfe_u32 v13, v84, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v14, v82, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v84
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_bfe_u32 v85, v14, 16, 1
-; GFX11-NEXT: v_bfe_u32 v86, v82, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v82
-; GFX11-NEXT: v_dual_cndmask_b32 v83, v12, v83 :: v_dual_add_nc_u32 v12, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v84
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v85, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-NEXT: v_add_nc_u32_e32 v85, v86, v82
-; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v13 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v85, 0x7fff, v85
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v70, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v84, v15, v84, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v86, v13, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v82, v85, v96, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v85, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_bfe_u32 v15, v87, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_bfe_u32 v97, v85, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v99, v96, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v100, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v113, 0x400000, v96
-; GFX11-NEXT: v_bfe_u32 v101, v98, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v99, v99, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v97, v97, v85
-; GFX11-NEXT: v_bfe_u32 v103, v100, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v101, v101, v98
-; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
-; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v85
-; GFX11-NEXT: v_add_nc_u32_e32 v101, 0x7fff, v101
-; GFX11-NEXT: v_add_nc_u32_e32 v103, v103, v100
-; GFX11-NEXT: v_cndmask_b32_e32 v96, v99, v113, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v87
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v86, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v103
-; GFX11-NEXT: v_or_b32_e32 v103, 0x400000, v100
-; GFX11-NEXT: v_cndmask_b32_e32 v98, v101, v114, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v87
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v85, v97, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v96
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v64, v65
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v55, v69
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v97, v99, v103, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v28
-; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v68, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v87, v15, v102, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v66, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v27
-; GFX11-NEXT: v_and_or_b32 v29, 0xffff0000, v52, v55
-; GFX11-NEXT: v_and_or_b32 v28, 0xffff0000, v51, v64
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v14, v86, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v85, v96
-; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v87
-; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v11
-; GFX11-NEXT: v_and_or_b32 v27, 0xffff0000, v50, v65
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v98
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
-; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v10
-; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v71, v87
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v81
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_and_or_b32 v30, 0xffff0000, v53, v54
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v22
-; GFX11-NEXT: v_and_or_b32 v25, 0xffff0000, v48, v49
-; GFX11-NEXT: v_and_or_b32 v24, 0xffff0000, v39, v50
-; GFX11-NEXT: v_and_or_b32 v23, 0xffff0000, v38, v51
-; GFX11-NEXT: v_and_or_b32 v22, 0xffff0000, v37, v52
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17
-; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v97, v98
-; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v85
-; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v84, v82
-; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v83, v86
-; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v9, v96
-; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v8, v71
-; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v80, v7
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v68
-; GFX11-NEXT: v_and_or_b32 v31, 0xffff0000, v31, v70
-; GFX11-NEXT: v_and_or_b32 v26, 0xffff0000, v26, v66
-; GFX11-NEXT: v_and_or_b32 v21, 0xffff0000, v21, v53
-; GFX11-NEXT: v_and_or_b32 v20, 0xffff0000, v35, v36
-; GFX11-NEXT: v_and_or_b32 v19, 0xffff0000, v34, v37
-; GFX11-NEXT: v_and_or_b32 v18, 0xffff0000, v33, v38
-; GFX11-NEXT: v_and_or_b32 v17, 0xffff0000, v32, v39
-; GFX11-NEXT: v_and_or_b32 v16, 0xffff0000, v16, v48
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB105_3:
-; GFX11-NEXT: s_branch .LBB105_2
-; GFX11-NEXT: .LBB105_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4
+; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_cndmask_b32 v16, v5, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v11, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v5, v7 :: v_dual_and_b32 v7, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v7 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v34.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v6, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v35.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v32.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v37, v0, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v37.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v7 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v38.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v4, v7 :: v_dual_and_b32 v7, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v0, v1 :: v_dual_add_f32 v1, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v25, v4, v8 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v49, v0, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v49.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v26, v4, v7 :: v_dual_and_b32 v7, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v7 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v50.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v51.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v7, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v0, v4, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v52.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v53, v2, v6 :: v_dual_lshlrev_b32 v2, 16, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v53.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v54, v0, v4 :: v_dual_add_nc_u32 v1, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v0, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_add_nc_u32 v1, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v64.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v2, v8 :: v_dual_add_nc_u32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v2, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v7 :: v_dual_add_nc_u32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v54.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v66.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v67.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v68.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v11 :: v_dual_add_nc_u32 v6, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v7, v11 :: v_dual_add_nc_u32 v7, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v69.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v70.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v13 :: v_dual_add_nc_u32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v80, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v12
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v81, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v13 :: v_dual_add_nc_u32 v9, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v80, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v80
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v81, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v9, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v80
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v71.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v81
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v82.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v15 :: v_dual_add_nc_u32 v10, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v83, v10, v13
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v15, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v81, v81, v84
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v12, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v83, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v84
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v81, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v81, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v83, v83, v87
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v84.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v13, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v86.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v85.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v14, v96 :: v_dual_add_nc_u32 v14, 0x7fff, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v97, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v83, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v100, v97, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v14, v96, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v98, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v99, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, v100, v97
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v81, v81, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v99, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, 0x7fff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v97
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v14, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v81, 0x7fff, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v96, v96, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v99
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v98, v102, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v87.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v96, 0x7fff, v96
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v65.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v81, v100, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v55.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v96, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v97.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB105_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB105_2
+; GFX11-TRUE16-NEXT: .LBB105_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4
+; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v11, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v7, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v6 :: v_dual_add_nc_u32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v1, v5 :: v_dual_lshlrev_b32 v5, 16, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v2, v6 :: v_dual_add_nc_u32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v24, v1, v3
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v7 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v27
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v0, v6 :: v_dual_add_nc_u32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v28, v1, v3 :: v_dual_lshlrev_b32 v5, 16, v29
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v52, v0, v1 :: v_dual_add_nc_u32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v31
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v0, v5 :: v_dual_add_f32 v0, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v1, v2 :: v_dual_add_nc_u32 v1, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v10, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, 0xffff0000, v69, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v11, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v8, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v71, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v10 :: v_dual_add_nc_u32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v9, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v71, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v14
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v71, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v83, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v71, v12, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v83, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v84, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v14, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v84
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v82, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v82
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v83, v12, v83 :: v_dual_add_nc_u32 v12, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v85, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, v86, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v12, v13 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, 0x7fff, v85
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffff0000, v70, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v15, v84, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v85, v96, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v85, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v87, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v97, v85, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v96, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v100, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v96
+; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v98, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, v99, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, v97, v85
+; GFX11-FAKE16-NEXT: v_bfe_u32 v103, v100, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, v101, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x7fff, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v103, v103, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v99, v113, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v86, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v103, 0x400000, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v101, v114, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v97, v112, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v64, v65
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v55, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v97, v99, v103, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v28
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v68, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v87, v15, v102, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v66, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v27
+; GFX11-FAKE16-NEXT: v_and_or_b32 v29, 0xffff0000, v52, v55
+; GFX11-FAKE16-NEXT: v_and_or_b32 v28, 0xffff0000, v51, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v14, v86, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_or_b32 v14, 0xffff0000, v85, v96
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v87
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v11
+; GFX11-FAKE16-NEXT: v_and_or_b32 v27, 0xffff0000, v50, v65
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v98
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v10
+; GFX11-FAKE16-NEXT: v_and_or_b32 v10, 0xffff0000, v71, v87
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v81
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_and_or_b32 v30, 0xffff0000, v53, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v22
+; GFX11-FAKE16-NEXT: v_and_or_b32 v25, 0xffff0000, v48, v49
+; GFX11-FAKE16-NEXT: v_and_or_b32 v24, 0xffff0000, v39, v50
+; GFX11-FAKE16-NEXT: v_and_or_b32 v23, 0xffff0000, v38, v51
+; GFX11-FAKE16-NEXT: v_and_or_b32 v22, 0xffff0000, v37, v52
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17
+; GFX11-FAKE16-NEXT: v_and_or_b32 v15, 0xffff0000, v97, v98
+; GFX11-FAKE16-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v85
+; GFX11-FAKE16-NEXT: v_and_or_b32 v12, 0xffff0000, v84, v82
+; GFX11-FAKE16-NEXT: v_and_or_b32 v11, 0xffff0000, v83, v86
+; GFX11-FAKE16-NEXT: v_and_or_b32 v9, 0xffff0000, v9, v96
+; GFX11-FAKE16-NEXT: v_and_or_b32 v8, 0xffff0000, v8, v71
+; GFX11-FAKE16-NEXT: v_and_or_b32 v7, 0xffff0000, v80, v7
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v68
+; GFX11-FAKE16-NEXT: v_and_or_b32 v31, 0xffff0000, v31, v70
+; GFX11-FAKE16-NEXT: v_and_or_b32 v26, 0xffff0000, v26, v66
+; GFX11-FAKE16-NEXT: v_and_or_b32 v21, 0xffff0000, v21, v53
+; GFX11-FAKE16-NEXT: v_and_or_b32 v20, 0xffff0000, v35, v36
+; GFX11-FAKE16-NEXT: v_and_or_b32 v19, 0xffff0000, v34, v37
+; GFX11-FAKE16-NEXT: v_and_or_b32 v18, 0xffff0000, v33, v38
+; GFX11-FAKE16-NEXT: v_and_or_b32 v17, 0xffff0000, v32, v39
+; GFX11-FAKE16-NEXT: v_and_or_b32 v16, 0xffff0000, v16, v48
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB105_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB105_2
+; GFX11-FAKE16-NEXT: .LBB105_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 582f31b..c6211aa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -3090,108 +3090,206 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v4i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7531,108 +7629,206 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v4f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11622,108 +11818,206 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v2i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15292,108 +15586,206 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v2f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -18154,83 +18546,75 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v8 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v4 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v12, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v11, v12 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v11, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -18580,104 +18964,191 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v8i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v5, v4
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v6, v8
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v8i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v8i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v5, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v6, v8
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -21477,112 +21948,210 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v8f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v6, 16, v7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v8f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v5, v11 :: v_dual_add_nc_u32 v10, v10, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v12, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v11 :: v_dual_add_nc_u32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v8f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v6, 16, v7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -24345,152 +24914,299 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v12, v16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v16i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB109_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s11, s3, 24
-; GFX11-NEXT: s_lshr_b32 s18, s3, 16
-; GFX11-NEXT: s_lshr_b32 s14, s3, 8
-; GFX11-NEXT: s_lshr_b32 s16, s2, 16
-; GFX11-NEXT: s_lshr_b32 s15, s2, 8
-; GFX11-NEXT: s_lshr_b32 s9, s1, 24
-; GFX11-NEXT: s_lshr_b32 s17, s1, 16
-; GFX11-NEXT: s_lshr_b32 s10, s1, 8
-; GFX11-NEXT: s_lshr_b32 s13, s0, 16
-; GFX11-NEXT: s_lshr_b32 s12, s0, 8
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB109_4
-; GFX11-NEXT: .LBB109_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v10, v14, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v10
-; GFX11-NEXT: v_lshl_or_b32 v9, v3, 16, v7
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: s_branch .LBB109_5
-; GFX11-NEXT: .LBB109_3:
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr17
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr16
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr18
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: s_branch .LBB109_2
-; GFX11-NEXT: .LBB109_4:
-; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1
-; GFX11-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15
-; GFX11-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14
-; GFX11-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12
-; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9
-; GFX11-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10
-; GFX11-NEXT: v_mov_b32_e32 v11, s6
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: .LBB109_5: ; %end
-; GFX11-NEXT: v_mov_b32_e32 v4, v17
-; GFX11-NEXT: v_mov_b32_e32 v12, v16
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v16i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s3, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s2, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_add_nc_u32 v1, v8, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v14.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v16.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: s_branch .LBB109_5
+; GFX11-TRUE16-NEXT: .LBB109_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: s_branch .LBB109_2
+; GFX11-TRUE16-NEXT: .LBB109_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: .LBB109_5: ; %end
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v17
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v16
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s3, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s3, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: s_branch .LBB109_5
+; GFX11-FAKE16-NEXT: .LBB109_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: s_branch .LBB109_2
+; GFX11-FAKE16-NEXT: .LBB109_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT: .LBB109_5: ; %end
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v17
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v16
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 0a73571..01e397d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -4485,203 +4485,384 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v8i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v8i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v8i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11456,203 +11637,384 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v8f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v8f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v8f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17995,203 +18357,384 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v4i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v4i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v4i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -23982,203 +24525,384 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v4f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v4f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v4f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -28722,13 +29446,10 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
@@ -28736,142 +29457,128 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v11, v12 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v10, 16, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v13, v15, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v14, v16, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.h
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v14, v10, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v14, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v15, v12, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v11, v18, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v11, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v15, v16, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v14, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v21 :: v_dual_cndmask_b32 v11, v16, v19
-; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v16, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v17, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v14, v15 :: v_dual_lshlrev_b32 v17, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v19, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v16, v18 :: v_dual_lshlrev_b32 v18, 16, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v20, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v14, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v19, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v22, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v23, v24, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v21, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v17, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v18, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v19, v25, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v20, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v15, v17, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v1, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v2, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v11, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v12, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v17, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v14.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -29497,175 +30204,334 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v16i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: s_and_b32 s8, s1, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: s_and_b32 s1, s2, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: s_and_b32 s1, s5, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v8, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v5, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v7
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v10, v4
-; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v3, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v12, v5
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s4, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v12, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: s_lshl_b32 s0, s5, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v14, v10
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v5, v12, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v15, v11
-; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v13, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v7, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s6, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v5
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s7, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v14, v18 :: v_dual_add_nc_u32 v14, 0x7fff, v16
-; GFX11-NEXT: v_bfe_u32 v16, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v7
-; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v14, v18 :: v_dual_add_nc_u32 v14, v20, v17
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v22, v19
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_bfe_u32 v18, v21, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v23, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v11, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v6, v15, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v1
-; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v17
-; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v13, v10
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v8, v12
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v9, v14
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v15
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v16i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v2, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v10.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s4, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v5, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v16 :: v_dual_add_nc_u32 v6, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v6, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v18, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v20, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v6, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v17.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v16i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v8, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v5, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v10, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v12, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s4, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v12, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v14, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v5, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v15, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v14, v18 :: v_dual_add_nc_u32 v14, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v18 :: v_dual_add_nc_u32 v14, v20, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v22, v19
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, 0xffff0000, v11, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v6, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffff0000, v16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v17
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v13, v10
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v8, v12
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v9, v14
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v15
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -32462,177 +33328,351 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: s_branch .LBB99_2
;
-; GFX11-LABEL: bitcast_v32i8_to_v16i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB99_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v16
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v20
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v12
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v19
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e64 v3, 0xffff, s10
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v13
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v7, 16, v23
-; GFX11-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v3
-; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB99_3
-; GFX11-NEXT: .LBB99_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v16
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v12
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v17
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-NEXT: v_or_b32_e32 v6, v14, v6
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB99_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB99_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT: s_branch .LBB99_2
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v10
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v21
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3
+; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: .LBB99_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB99_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT: s_branch .LBB99_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3
+; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB99_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB99_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT: s_branch .LBB99_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -34430,192 +35470,369 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v16f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: s_and_b32 s8, s2, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s4, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v9, v5
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v6, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, v6, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v4
-; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_and_b32 s0, s5, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v7
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s0, s5, 16
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v5
-; GFX11-NEXT: v_bfe_u32 v5, v12, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_bfe_u32 v13, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v12
-; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s6, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v6
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v10 :: v_dual_add_nc_u32 v10, v14, v15
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s7, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v18, v14, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v18, v14
-; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v20, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v16 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v15, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v16
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshl_or_b32 v6, v12, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v7, v13, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v4, v17, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v3, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v11, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v8, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v14
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v16f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v6, v2 :: v_dual_add_nc_u32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v17
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v2, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s4, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v9.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v15, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v12, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v20, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v15.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v16f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s4, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, v6, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v12
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v10 :: v_dual_add_nc_u32 v10, v14, v15
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v18, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v20, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v16 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v15, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v12, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v13, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v17, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v8, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v14
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -37354,177 +38571,351 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: s_branch .LBB107_2
;
-; GFX11-LABEL: bitcast_v32i8_to_v16f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB107_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v16
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v20
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v12
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v19
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e64 v3, 0xffff, s10
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v13
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v7, 16, v23
-; GFX11-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v3
-; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB107_3
-; GFX11-NEXT: .LBB107_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v16
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v12
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v17
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-NEXT: v_or_b32_e32 v6, v14, v6
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB107_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB107_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT: s_branch .LBB107_2
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v10
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v21
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3
+; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: .LBB107_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB107_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT: s_branch .LBB107_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3
+; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB107_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB107_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT: s_branch .LBB107_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -39446,281 +40837,552 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v28, v32
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v32i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s12, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB109_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s27, s19, 24
-; GFX11-NEXT: s_lshr_b32 s46, s19, 16
-; GFX11-NEXT: s_lshr_b32 s40, s19, 8
-; GFX11-NEXT: s_lshr_b32 s42, s18, 16
-; GFX11-NEXT: s_lshr_b32 s41, s18, 8
-; GFX11-NEXT: s_lshr_b32 s23, s17, 24
-; GFX11-NEXT: s_lshr_b32 s45, s17, 16
-; GFX11-NEXT: s_lshr_b32 s26, s17, 8
-; GFX11-NEXT: s_lshr_b32 s29, s16, 16
-; GFX11-NEXT: s_lshr_b32 s28, s16, 8
-; GFX11-NEXT: s_lshr_b32 s15, s3, 24
-; GFX11-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-NEXT: s_lshr_b32 s22, s3, 8
-; GFX11-NEXT: s_lshr_b32 s25, s2, 16
-; GFX11-NEXT: s_lshr_b32 s24, s2, 8
-; GFX11-NEXT: s_lshr_b32 s13, s1, 24
-; GFX11-NEXT: s_lshr_b32 s43, s1, 16
-; GFX11-NEXT: s_lshr_b32 s14, s1, 8
-; GFX11-NEXT: s_lshr_b32 s21, s0, 16
-; GFX11-NEXT: s_lshr_b32 s20, s0, 8
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[18:19], 24
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12
-; GFX11-NEXT: s_cbranch_vccnz .LBB109_4
-; GFX11-NEXT: .LBB109_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s1, 16
-; GFX11-NEXT: s_and_b32 s1, s1, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s4, s0, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: s_and_b32 s1, s3, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s0, s2, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, v10, v3
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v35
-; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v0, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v11 :: v_dual_add_nc_u32 v12, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v9, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v9
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v4
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v14, 16, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v7, v12, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v13, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v13, v15, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15
-; GFX11-NEXT: v_bfe_u32 v13, v17, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v13, v17
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_bfe_u32 v18, v13, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v20, v16, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v18, v13
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v20, v16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v4
-; GFX11-NEXT: v_bfe_u32 v17, v19, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v16
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v18, v23, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v17
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v18, v22, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v32
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v26, v30, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v17, v5, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v25, v7, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v26
-; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18]
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26]
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: s_branch .LBB109_5
-; GFX11-NEXT: .LBB109_3:
-; GFX11-NEXT: ; implicit-def: $sgpr20
-; GFX11-NEXT: ; implicit-def: $sgpr21
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr24
-; GFX11-NEXT: ; implicit-def: $sgpr25
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr22
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr28
-; GFX11-NEXT: ; implicit-def: $sgpr29
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr26
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr23
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr27
-; GFX11-NEXT: s_branch .LBB109_2
-; GFX11-NEXT: .LBB109_4:
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17
-; GFX11-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1
-; GFX11-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41
-; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27
-; GFX11-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40
-; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28
-; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24
-; GFX11-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15
-; GFX11-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22
-; GFX11-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20
-; GFX11-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14
-; GFX11-NEXT: v_mov_b32_e32 v27, s10
-; GFX11-NEXT: v_mov_b32_e32 v19, s8
-; GFX11-NEXT: v_mov_b32_e32 v11, s6
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: .LBB109_5: ; %end
-; GFX11-NEXT: v_mov_b32_e32 v4, v35
-; GFX11-NEXT: v_mov_b32_e32 v12, v34
-; GFX11-NEXT: v_mov_b32_e32 v20, v33
-; GFX11-NEXT: v_mov_b32_e32 v28, v32
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v32i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s19, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s18, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s17, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s16, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s3, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s3, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, v20, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v9 :: v_dual_add_nc_u32 v4, v5, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v12 :: v_dual_add_nc_u32 v12, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v22.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v15, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v12, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v13, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v30.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v32.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v5.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_branch .LBB109_5
+; GFX11-TRUE16-NEXT: .LBB109_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27
+; GFX11-TRUE16-NEXT: s_branch .LBB109_2
+; GFX11-TRUE16-NEXT: .LBB109_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v27, s10
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, s8
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: .LBB109_5: ; %end
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v35
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v34
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v33
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v32
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v32i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s19, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s18, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s16, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s3, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s3, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, v10, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v11 :: v_dual_add_nc_u32 v12, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v7, v12, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v13, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v18, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v20, v16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v12, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v22, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v5, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v7, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: s_branch .LBB109_5
+; GFX11-FAKE16-NEXT: .LBB109_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27
+; GFX11-FAKE16-NEXT: s_branch .LBB109_2
+; GFX11-FAKE16-NEXT: .LBB109_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT: .LBB109_5: ; %end
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v35
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v34
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v33
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v32
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -41235,177 +42897,351 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: s_branch .LBB111_2
;
-; GFX11-LABEL: bitcast_v32i8_to_v16bf16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB111_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v16
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v20
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v12
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v19
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e64 v3, 0xffff, s10
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v13
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v7, 16, v23
-; GFX11-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v3
-; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB111_3
-; GFX11-NEXT: .LBB111_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v16
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v12
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v17
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-NEXT: v_or_b32_e32 v6, v14, v6
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB111_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB111_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT: s_branch .LBB111_2
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16bf16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v10
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v21
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3
+; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: .LBB111_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB111_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT: s_branch .LBB111_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16bf16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3
+; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB111_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB111_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT: s_branch .LBB111_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index b622e6e..9041f64 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -19562,212 +19562,421 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB51_2
;
-; GFX11-LABEL: bitcast_v40i8_to_v20i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
-; GFX11-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
-; GFX11-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB51_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v22
-; GFX11-NEXT: v_and_b32_e64 v2, 0xffff, s10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v24
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v28
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v31
-; GFX11-NEXT: v_or_b32_e32 v3, v5, v34
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v29
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v33
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT: v_or_b32_e32 v9, v6, v35
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v36
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v37
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v20
-; GFX11-NEXT: v_or_b32_e32 v8, v8, v19
-; GFX11-NEXT: v_or_b32_e32 v12, v6, v17
-; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v32
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-NEXT: v_or_b32_e32 v10, v10, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v9, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v8, v12, 16, v13
-; GFX11-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-NEXT: v_lshl_or_b32 v9, v10, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB51_3
-; GFX11-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v27
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-NEXT: v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v26
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v29
-; GFX11-NEXT: v_or_b32_e32 v5, v35, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v33, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v18
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v25
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v20
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
-; GFX11-NEXT: v_or_b32_e32 v5, v34, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v19, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v37, v2
-; GFX11-NEXT: v_or_b32_e32 v7, v32, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v22, v4
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_or_b32_e32 v1, v21, v1
-; GFX11-NEXT: v_or_b32_e32 v3, v17, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-NEXT: v_or_b32_e32 v6, v31, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_and_b32_e64 v10, 0xffff, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v5, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v6, v11, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v7, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB51_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB51_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT: s_branch .LBB51_2
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v37
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
+; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v34, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v35, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: s_branch .LBB51_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v6, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3
+; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB51_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB51_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT: s_branch .LBB51_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -25600,212 +25809,421 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB63_2
;
-; GFX11-LABEL: bitcast_v40i8_to_v20f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
-; GFX11-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
-; GFX11-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB63_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v22
-; GFX11-NEXT: v_and_b32_e64 v2, 0xffff, s10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v24
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v28
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v31
-; GFX11-NEXT: v_or_b32_e32 v3, v5, v34
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v29
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v33
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT: v_or_b32_e32 v9, v6, v35
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v36
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v37
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v20
-; GFX11-NEXT: v_or_b32_e32 v8, v8, v19
-; GFX11-NEXT: v_or_b32_e32 v12, v6, v17
-; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v32
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-NEXT: v_or_b32_e32 v10, v10, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v9, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v8, v12, 16, v13
-; GFX11-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-NEXT: v_lshl_or_b32 v9, v10, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB63_3
-; GFX11-NEXT: .LBB63_2: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v27
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-NEXT: v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v26
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v29
-; GFX11-NEXT: v_or_b32_e32 v5, v35, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v33, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v18
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v25
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v20
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
-; GFX11-NEXT: v_or_b32_e32 v5, v34, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v19, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v37, v2
-; GFX11-NEXT: v_or_b32_e32 v7, v32, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v22, v4
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_or_b32_e32 v1, v21, v1
-; GFX11-NEXT: v_or_b32_e32 v3, v17, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-NEXT: v_or_b32_e32 v6, v31, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_and_b32_e64 v10, 0xffff, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v5, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v6, v11, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v7, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB63_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB63_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT: s_branch .LBB63_2
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v37
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v34, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v35, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-TRUE16-NEXT: .LBB63_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB63_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: s_branch .LBB63_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v6, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB63_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB63_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT: s_branch .LBB63_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index e6c7b1a..73b57a5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -1482,46 +1482,87 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB15_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB15_4
-; GFX11-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB15_3:
-; GFX11-NEXT: s_branch .LBB15_2
-; GFX11-NEXT: .LBB15_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_4
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB15_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB15_2
+; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_4
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB15_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB15_2
+; GFX11-FAKE16-NEXT: .LBB15_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3720,46 +3761,87 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB35_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB35_4
-; GFX11-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB35_3:
-; GFX11-NEXT: s_branch .LBB35_2
-; GFX11-NEXT: .LBB35_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_4
+; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB35_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB35_2
+; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_4
+; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB35_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB35_2
+; GFX11-FAKE16-NEXT: .LBB35_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5424,27 +5506,24 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: .LBB50_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -5592,44 +5671,81 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v2i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB51_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB51_4
-; GFX11-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB51_3:
-; GFX11-NEXT: s_branch .LBB51_2
-; GFX11-NEXT: .LBB51_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_4
+; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB51_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB51_2
+; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_4
+; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB51_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB51_2
+; GFX11-FAKE16-NEXT: .LBB51_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7223,46 +7339,87 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v2f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB63_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB63_4
-; GFX11-NEXT: .LBB63_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB63_3:
-; GFX11-NEXT: s_branch .LBB63_2
-; GFX11-NEXT: .LBB63_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_4
+; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB63_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB63_2
+; GFX11-TRUE16-NEXT: .LBB63_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_4
+; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB63_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB63_2
+; GFX11-FAKE16-NEXT: .LBB63_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8653,46 +8810,87 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v1i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB73_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB73_4
-; GFX11-NEXT: .LBB73_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB73_3:
-; GFX11-NEXT: s_branch .LBB73_2
-; GFX11-NEXT: .LBB73_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB73_4
+; GFX11-TRUE16-NEXT: .LBB73_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB73_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB73_2
+; GFX11-TRUE16-NEXT: .LBB73_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB73_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB73_4
+; GFX11-FAKE16-NEXT: .LBB73_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB73_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB73_2
+; GFX11-FAKE16-NEXT: .LBB73_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9258,57 +9456,109 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v4i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB77_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s2, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB77_4
-; GFX11-NEXT: .LBB77_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB77_3:
-; GFX11-NEXT: ; implicit-def: $sgpr3
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr2
-; GFX11-NEXT: s_branch .LBB77_2
-; GFX11-NEXT: .LBB77_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v4i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB77_4
+; GFX11-TRUE16-NEXT: .LBB77_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB77_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-TRUE16-NEXT: s_branch .LBB77_2
+; GFX11-TRUE16-NEXT: .LBB77_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v4i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB77_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB77_4
+; GFX11-FAKE16-NEXT: .LBB77_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB77_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-FAKE16-NEXT: s_branch .LBB77_2
+; GFX11-FAKE16-NEXT: .LBB77_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index acc0247..d5d2d4aa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -374,59 +374,112 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v3bf16_to_v3f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB1_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB1_4
-; GFX11-NEXT: .LBB1_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s2, s0, 16
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB1_3:
-; GFX11-NEXT: s_branch .LBB1_2
-; GFX11-NEXT: .LBB1_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB1_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_4
+; GFX11-TRUE16-NEXT: .LBB1_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB1_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB1_2
+; GFX11-TRUE16-NEXT: .LBB1_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB1_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB1_4
+; GFX11-FAKE16-NEXT: .LBB1_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB1_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB1_2
+; GFX11-FAKE16-NEXT: .LBB1_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -803,38 +856,36 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: .LBB4_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1025,56 +1076,105 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v3bf16_to_v3i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB5_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB5_4
-; GFX11-NEXT: .LBB5_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s2, s0, 16
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB5_3:
-; GFX11-NEXT: s_branch .LBB5_2
-; GFX11-NEXT: .LBB5_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s0, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v7 :: v_dual_add_nc_u32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB5_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB5_2
+; GFX11-TRUE16-NEXT: .LBB5_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX11-FAKE16-NEXT: .LBB5_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB5_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB5_2
+; GFX11-FAKE16-NEXT: .LBB5_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index bff054f..ee23420 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -7351,360 +7351,696 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v16i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v16i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -21906,360 +22242,696 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v16f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v16f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -35969,360 +36641,696 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v8i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v8i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -49092,360 +50100,696 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v8f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v8f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -60096,298 +61440,258 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v26, 16, v7
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v0 :: v_dual_lshlrev_b32 v19, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v16, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v0, v0, v16, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v16, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v0, v0, v22
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v18, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v21, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v12
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v2, v21, v16
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v18, v20, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v22, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v22, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v22, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v22, v23, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v13
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v21, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v18, v19, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v19
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v18, v20, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v22, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v20, v22, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v21, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v22, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v23, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v23
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v22, v23, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v4, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v22, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v3, v19, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v22, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v20.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v21, v22 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v24, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v24
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v24, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v5, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v22, v23, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v25, v22, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v19.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v24, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v23, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v25, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v25
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v22, v24, v25, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v23, v24, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_cndmask_b32 v5, v21, v25
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v26, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v26, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v21, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v23, v24 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v26, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v26
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v26, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v24, v25, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add3_u32 v26, v27, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v23, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v26, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v25, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v24, v25 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v27, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v27
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v24, v26, v27, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v25, v26, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v7, v23, v27
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v28, v25, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v28, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v25, v26 :: v_dual_and_b32 v9, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v28, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v28
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v25, v27, v28, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v26, v27, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v28, v29, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v25, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v22.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v28, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v27, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v26, v27 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 0x40c00000, v29 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v29, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v29
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v26, v28, v29, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v10, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v27, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v27, v28, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_cndmask_b32 v9, v25, v29
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v30, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v30, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v25, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v27, v28 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v30, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v30
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; GFX11-TRUE16-NEXT: v_add3_u32 v27, v29, v30, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v11, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v6, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v27, v28, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v31, v28, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v27, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v27, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v28, v29 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v31, 0x40c00000, v31 :: v_dual_add_f32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v31, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v27
-; GFX11-TRUE16-NEXT: v_add3_u32 v29, v30, v31, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-TRUE16-NEXT: v_add3_u32 v28, v28, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 16, v11
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v27, v30, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v31
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v32, v29, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v32, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14
-; GFX11-TRUE16-NEXT: v_add3_u32 v28, v34, v30, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v31, v35 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v32, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v12.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v15, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v36, v32, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v29
-; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v13, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v5, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v16, 16, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v4, 16, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v33
+; GFX11-TRUE16-NEXT: v_add3_u32 v31, v35, v13, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v4, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v21, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v14, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v34.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v31, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v36, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v29.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v32, v34, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v30, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v26.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v34
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v30, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v14, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v31
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v1, 16, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v28
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v3, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v16, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v18, 16, v20
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v27.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v31, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v33, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v30.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v28.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -61605,325 +62909,620 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v32i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s12, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s13, 16
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: s_and_b32 s2, s14, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: s_lshl_b32 s1, s27, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v3
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v4
-; GFX11-NEXT: v_bfe_u32 v7, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v10
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v6, v8 :: v_dual_add_nc_u32 v7, v9, v5
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v7, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v6
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v12
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v7, v8, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v8, v10 :: v_dual_add_nc_u32 v8, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v7
-; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v13
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v10, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v9
-; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v14, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v10, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s0
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v11, v12 :: v_dual_add_nc_u32 v9, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v15
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_bfe_u32 v13, v23, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v23
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v13
-; GFX11-NEXT: v_bfe_u32 v13, v25, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v12, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v15, v11
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v23
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_bfe_u32 v28, v14, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v23, v10, v15, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v27, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v28, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX11-NEXT: v_bfe_u32 v29, v15, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: s_lshl_b32 s0, s24, 16
-; GFX11-NEXT: v_bfe_u32 v13, v27, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v25, v12, v28 :: v_dual_add_nc_u32 v12, v29, v15
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v27
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v15
-; GFX11-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v27
-; GFX11-NEXT: v_bfe_u32 v32, v28, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v29 :: v_dual_add_nc_u32 v15, v30, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_or_b32_e32 v30, 0x400000, v14
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v27, v13, v31, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v32, v28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v32, v29, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v15, v30 :: v_dual_add_nc_u32 v13, 0x7fff, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v32, v29
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v28, v33, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v31
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v30
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v29
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v35, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v48, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v37, v49, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v15, v36, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v32, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v38, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v27, v31
-; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v25, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v9
-; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v28, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v26, v23
-; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v24, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v5, v27
-; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v21, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v29
-; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v10, v33
-; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v19, v20
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v18, v21
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v17, v22
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v16, v23
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v3, v7 :: v_dual_add_nc_u32 v5, v5, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v20.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v11 :: v_dual_add_nc_u32 v6, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v10
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v7, v11 :: v_dual_add_nc_u32 v7, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v22.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v13 :: v_dual_add_nc_u32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v13 :: v_dual_add_nc_u32 v9, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v24
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v9, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v25, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v26.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v15 :: v_dual_add_nc_u32 v10, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v25
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v10, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v15, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v25, v28
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v12, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v25
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v25, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v31, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v27, v31
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v28.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v13, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v29.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v14, v32 :: v_dual_add_nc_u32 v14, 0x7fff, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v14, v32, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v34, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v33
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v25, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v14, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x7fff, v25
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v25, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v27.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v33.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s13, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v6, v8 :: v_dual_add_nc_u32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v8, v10 :: v_dual_add_nc_u32 v8, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v11, v12 :: v_dual_add_nc_u32 v9, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v12, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v15, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v10, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v28, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v11, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v27, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v12, v28 :: v_dual_add_nc_u32 v12, v29, v15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v27
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v28, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v12, v29 :: v_dual_add_nc_u32 v15, v30, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v14
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v13, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v32, v28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v15, v30 :: v_dual_add_nc_u32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v32, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v34, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v28, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v35, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v37, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v15, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_or_b32 v15, 0xffff0000, v32, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT: v_and_or_b32 v12, 0xffff0000, v27, v31
+; GFX11-FAKE16-NEXT: v_and_or_b32 v11, 0xffff0000, v25, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v9
+; GFX11-FAKE16-NEXT: v_and_or_b32 v14, 0xffff0000, v28, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT: v_and_or_b32 v9, 0xffff0000, v26, v23
+; GFX11-FAKE16-NEXT: v_and_or_b32 v8, 0xffff0000, v24, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffff0000, v5, v27
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, 0xffff0000, v21, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v29
+; GFX11-FAKE16-NEXT: v_and_or_b32 v10, 0xffff0000, v10, v33
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v19, v20
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v18, v21
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v17, v22
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v16, v23
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -69965,170 +71564,106 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v48
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v23
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v69
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v84
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v3, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3
; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
@@ -70137,11 +71672,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
@@ -70159,83 +71689,133 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v70, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_add_nc_u32 v13, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v29, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v19, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v53, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v50, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v49, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v48, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v20.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
; GFX11-TRUE16-NEXT: .LBB99_3: ; %end
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -73991,358 +75571,687 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v32f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s12, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s13, 16
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_and_b32 s2, s14, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v9, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v6, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v8
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v5
-; GFX11-NEXT: v_bfe_u32 v36, v33, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v3
-; GFX11-NEXT: v_bfe_u32 v3, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v3, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_bfe_u32 v10, v9, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v9
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v7
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_bfe_u32 v13, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v13, v8
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v13
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v15, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v10
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v15
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v11
-; GFX11-NEXT: v_bfe_u32 v26, v14, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v27, v12, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v26, v27, v12
-; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-NEXT: v_bfe_u32 v26, v27, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v13
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v26, v27
-; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13
-; GFX11-NEXT: v_bfe_u32 v30, v26, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v30, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_bfe_u32 v14, v27, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v14, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_bfe_u32 v34, v29, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29
-; GFX11-NEXT: v_bfe_u32 v26, v30, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v34, v29
-; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v33
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v15, v30, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v11, v28, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v12, v31, 16, v30
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v34
-; GFX11-NEXT: v_lshl_or_b32 v14, v26, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v13, v32, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v5, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v9, v24, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v8, v23, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v5, v21, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v3, v18, 16, v19
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v2, v17, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v1, v16, 16, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v23
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s13, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v6, v2 :: v_dual_add_nc_u32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v2, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v34, v33
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v6 :: v_dual_add_nc_u32 v4, v7, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v8 :: v_dual_add_nc_u32 v6, v9, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v22.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v10 :: v_dual_add_nc_u32 v8, v11, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v24.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v12 :: v_dual_add_nc_u32 v10, v13, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v26, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v11, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v10, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v27.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v12, v14 :: v_dual_add_nc_u32 v13, v15, v28
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v29, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v25.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v30.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v28, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v32, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v31, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v29.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v31, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v28, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v36, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v23.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v33
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v19.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v31.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s13, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v3, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v27, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v27, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v26, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v30, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v27, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v34, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v33
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v30, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v26, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v32, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v5, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v24, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v23, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v21, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v18, 16, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v17, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v16, 16, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v23
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -81928,170 +83837,106 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v48
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v23
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v69
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v84
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v3, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3
; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
@@ -82100,11 +83945,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
@@ -82122,83 +83962,133 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v70, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_add_nc_u32 v13, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v29, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v19, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v53, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v50, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v49, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v48, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v20.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
; GFX11-TRUE16-NEXT: .LBB107_3: ; %end
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -92186,170 +94076,106 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v48
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v23
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v69
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v84
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v3, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3
; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
@@ -92358,11 +94184,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
@@ -92380,83 +94201,133 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v70, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_add_nc_u32 v13, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v29, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v19, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v53, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v50, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v49, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v48, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v20.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
; GFX11-TRUE16-NEXT: .LBB111_3: ; %end
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index a1c0a87..5d4df4b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -10227,149 +10227,285 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v18f32_to_v36i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s6, s28, 16
-; GFX11-NEXT: s_lshr_b32 s7, s27, 16
-; GFX11-NEXT: s_lshr_b32 s8, s26, 16
-; GFX11-NEXT: s_lshr_b32 s9, s25, 16
-; GFX11-NEXT: s_lshr_b32 s10, s24, 16
-; GFX11-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-NEXT: s_lshr_b32 s12, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s14, s20, 16
-; GFX11-NEXT: s_lshr_b32 s15, s19, 16
-; GFX11-NEXT: s_lshr_b32 s40, s18, 16
-; GFX11-NEXT: s_lshr_b32 s41, s17, 16
-; GFX11-NEXT: s_lshr_b32 s42, s16, 16
-; GFX11-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_4
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_add_f32_e64 v13, s29, 1.0
-; GFX11-NEXT: v_add_f32_e64 v14, s28, 1.0
-; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0
-; GFX11-NEXT: v_add_f32_e64 v16, s26, 1.0
-; GFX11-NEXT: v_add_f32_e64 v17, s25, 1.0
-; GFX11-NEXT: v_add_f32_e64 v8, s24, 1.0
-; GFX11-NEXT: v_add_f32_e64 v9, s23, 1.0
-; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0
-; GFX11-NEXT: v_add_f32_e64 v11, s21, 1.0
-; GFX11-NEXT: v_add_f32_e64 v12, s20, 1.0
-; GFX11-NEXT: v_add_f32_e64 v3, s19, 1.0
-; GFX11-NEXT: v_add_f32_e64 v4, s18, 1.0
-; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0
-; GFX11-NEXT: v_add_f32_e64 v6, s16, 1.0
-; GFX11-NEXT: v_add_f32_e64 v7, s3, 1.0
-; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
-; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0
-; GFX11-NEXT: v_add_f32_e64 v2, s0, 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: s_branch .LBB29_5
-; GFX11-NEXT: .LBB29_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB29_2
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
-; GFX11-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
-; GFX11-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
-; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
-; GFX11-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
-; GFX11-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
-; GFX11-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
-; GFX11-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
-; GFX11-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
-; GFX11-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
-; GFX11-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
-; GFX11-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
-; GFX11-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
-; GFX11-NEXT: .LBB29_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_4
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s29, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s28, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s26, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s25, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s24, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s23, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s21, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s20, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s19, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s18, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s16, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s3, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s2, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB29_5
+; GFX11-TRUE16-NEXT: .LBB29_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-TRUE16-NEXT: .LBB29_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_4
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: s_branch .LBB29_5
+; GFX11-FAKE16-NEXT: .LBB29_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-FAKE16-NEXT: .LBB29_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12999,149 +13135,285 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v18f32_to_v36f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s6, s28, 16
-; GFX11-NEXT: s_lshr_b32 s7, s27, 16
-; GFX11-NEXT: s_lshr_b32 s8, s26, 16
-; GFX11-NEXT: s_lshr_b32 s9, s25, 16
-; GFX11-NEXT: s_lshr_b32 s10, s24, 16
-; GFX11-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-NEXT: s_lshr_b32 s12, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s14, s20, 16
-; GFX11-NEXT: s_lshr_b32 s15, s19, 16
-; GFX11-NEXT: s_lshr_b32 s40, s18, 16
-; GFX11-NEXT: s_lshr_b32 s41, s17, 16
-; GFX11-NEXT: s_lshr_b32 s42, s16, 16
-; GFX11-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_4
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_add_f32_e64 v13, s29, 1.0
-; GFX11-NEXT: v_add_f32_e64 v14, s28, 1.0
-; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0
-; GFX11-NEXT: v_add_f32_e64 v16, s26, 1.0
-; GFX11-NEXT: v_add_f32_e64 v17, s25, 1.0
-; GFX11-NEXT: v_add_f32_e64 v8, s24, 1.0
-; GFX11-NEXT: v_add_f32_e64 v9, s23, 1.0
-; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0
-; GFX11-NEXT: v_add_f32_e64 v11, s21, 1.0
-; GFX11-NEXT: v_add_f32_e64 v12, s20, 1.0
-; GFX11-NEXT: v_add_f32_e64 v3, s19, 1.0
-; GFX11-NEXT: v_add_f32_e64 v4, s18, 1.0
-; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0
-; GFX11-NEXT: v_add_f32_e64 v6, s16, 1.0
-; GFX11-NEXT: v_add_f32_e64 v7, s3, 1.0
-; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
-; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0
-; GFX11-NEXT: v_add_f32_e64 v2, s0, 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: s_branch .LBB33_5
-; GFX11-NEXT: .LBB33_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB33_2
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
-; GFX11-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
-; GFX11-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
-; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
-; GFX11-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
-; GFX11-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
-; GFX11-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
-; GFX11-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
-; GFX11-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
-; GFX11-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
-; GFX11-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
-; GFX11-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
-; GFX11-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
-; GFX11-NEXT: .LBB33_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_4
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s29, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s28, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s26, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s25, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s24, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s23, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s21, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s20, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s19, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s18, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s16, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s3, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s2, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB33_5
+; GFX11-TRUE16-NEXT: .LBB33_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-TRUE16-NEXT: .LBB33_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_4
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: s_branch .LBB33_5
+; GFX11-FAKE16-NEXT: .LBB33_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-FAKE16-NEXT: .LBB33_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -21895,140 +22167,270 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v9f64_to_v36i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s14, s28, 16
-; GFX11-NEXT: s_lshr_b32 s6, s27, 16
-; GFX11-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-NEXT: s_lshr_b32 s7, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s8, s23, 16
-; GFX11-NEXT: s_lshr_b32 s41, s22, 16
-; GFX11-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-NEXT: s_lshr_b32 s42, s20, 16
-; GFX11-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-NEXT: s_lshr_b32 s43, s18, 16
-; GFX11-NEXT: s_lshr_b32 s11, s17, 16
-; GFX11-NEXT: s_lshr_b32 s44, s16, 16
-; GFX11-NEXT: s_lshr_b32 s12, s3, 16
-; GFX11-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-NEXT: s_lshr_b32 s13, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_4
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v0
-; GFX11-NEXT: s_branch .LBB49_5
-; GFX11-NEXT: .LBB49_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB49_2
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
-; GFX11-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
-; GFX11-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
-; GFX11-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
-; GFX11-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
-; GFX11-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
-; GFX11-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
-; GFX11-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
-; GFX11-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
-; GFX11-NEXT: .LBB49_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v33, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v32, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v26, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v1, v35, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v31, 16, v34
-; GFX11-NEXT: v_lshl_or_b32 v9, v29, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v22, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v24, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v16, v21, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v17, v23, 16, v19
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], s[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], s[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], s[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], s[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], s[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB49_5
+; GFX11-TRUE16-NEXT: .LBB49_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v35, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s45 :: v_dual_mov_b32 v33, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v31, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v27, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s41 :: v_dual_mov_b32 v25, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s40 :: v_dual_mov_b32 v23, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s15 :: v_dual_mov_b32 v21, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v19, s5
+; GFX11-TRUE16-NEXT: .LBB49_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v32 :: v_dual_mov_b32 v31, v31
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v34 :: v_dual_mov_b32 v35, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v32.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v33 :: v_dual_mov_b32 v27, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v30 :: v_dual_mov_b32 v29, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v28 :: v_dual_mov_b32 v25, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v26 :: v_dual_mov_b32 v21, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v24 :: v_dual_mov_b32 v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v23
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v19.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-FAKE16-NEXT: s_branch .LBB49_5
+; GFX11-FAKE16-NEXT: .LBB49_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
+; GFX11-FAKE16-NEXT: .LBB49_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -24595,140 +24997,270 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v9f64_to_v36f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s14, s28, 16
-; GFX11-NEXT: s_lshr_b32 s6, s27, 16
-; GFX11-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-NEXT: s_lshr_b32 s7, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s8, s23, 16
-; GFX11-NEXT: s_lshr_b32 s41, s22, 16
-; GFX11-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-NEXT: s_lshr_b32 s42, s20, 16
-; GFX11-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-NEXT: s_lshr_b32 s43, s18, 16
-; GFX11-NEXT: s_lshr_b32 s11, s17, 16
-; GFX11-NEXT: s_lshr_b32 s44, s16, 16
-; GFX11-NEXT: s_lshr_b32 s12, s3, 16
-; GFX11-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-NEXT: s_lshr_b32 s13, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_4
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v0
-; GFX11-NEXT: s_branch .LBB53_5
-; GFX11-NEXT: .LBB53_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB53_2
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
-; GFX11-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
-; GFX11-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
-; GFX11-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
-; GFX11-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
-; GFX11-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
-; GFX11-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
-; GFX11-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
-; GFX11-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
-; GFX11-NEXT: .LBB53_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v33, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v32, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v26, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v1, v35, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v31, 16, v34
-; GFX11-NEXT: v_lshl_or_b32 v9, v29, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v22, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v24, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v16, v21, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v17, v23, 16, v19
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], s[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], s[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], s[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], s[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], s[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB53_5
+; GFX11-TRUE16-NEXT: .LBB53_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v35, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s45 :: v_dual_mov_b32 v33, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v31, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v27, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s41 :: v_dual_mov_b32 v25, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s40 :: v_dual_mov_b32 v23, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s15 :: v_dual_mov_b32 v21, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v19, s5
+; GFX11-TRUE16-NEXT: .LBB53_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v32 :: v_dual_mov_b32 v31, v31
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v34 :: v_dual_mov_b32 v35, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v32.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v33 :: v_dual_mov_b32 v27, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v30 :: v_dual_mov_b32 v29, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v28 :: v_dual_mov_b32 v25, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v26 :: v_dual_mov_b32 v21, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v24 :: v_dual_mov_b32 v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v23
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v19.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-FAKE16-NEXT: s_branch .LBB53_5
+; GFX11-FAKE16-NEXT: .LBB53_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
+; GFX11-FAKE16-NEXT: .LBB53_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -27654,149 +28186,285 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, v19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36i16_to_v36f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-NEXT: s_lshr_b32 s41, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-NEXT: s_lshr_b32 s14, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s12, s20, 16
-; GFX11-NEXT: s_lshr_b32 s11, s19, 16
-; GFX11-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-NEXT: s_lshr_b32 s9, s17, 16
-; GFX11-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-NEXT: s_lshr_b32 s6, s3, 16
-; GFX11-NEXT: s_lshr_b32 s8, s2, 16
-; GFX11-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: s_mov_b32 s46, 0
-; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
-; GFX11-NEXT: s_cbranch_vccnz .LBB57_4
-; GFX11-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s40
-; GFX11-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s22, s14
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-NEXT: s_pack_ll_b32_b16 s12, s20, s12
-; GFX11-NEXT: s_pack_ll_b32_b16 s11, s19, s11
-; GFX11-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s9, s17, s9
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s5
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: s_branch .LBB57_5
-; GFX11-NEXT: .LBB57_3:
-; GFX11-NEXT: s_branch .LBB57_2
-; GFX11-NEXT: .LBB57_4:
-; GFX11-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
-; GFX11-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
-; GFX11-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
-; GFX11-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
-; GFX11-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
-; GFX11-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
-; GFX11-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
-; GFX11-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
-; GFX11-NEXT: .LBB57_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v36f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
+; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: s_branch .LBB57_5
+; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB57_2
+; GFX11-TRUE16-NEXT: .LBB57_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v36f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4
+; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: s_branch .LBB57_5
+; GFX11-FAKE16-NEXT: .LBB57_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB57_2
+; GFX11-FAKE16-NEXT: .LBB57_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-FAKE16-NEXT: .LBB57_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -29137,149 +29805,285 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, v19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36f16_to_v36i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-NEXT: s_lshr_b32 s41, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-NEXT: s_lshr_b32 s14, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s12, s20, 16
-; GFX11-NEXT: s_lshr_b32 s11, s19, 16
-; GFX11-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-NEXT: s_lshr_b32 s9, s17, 16
-; GFX11-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-NEXT: s_lshr_b32 s6, s3, 16
-; GFX11-NEXT: s_lshr_b32 s8, s2, 16
-; GFX11-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: s_mov_b32 s46, 0
-; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
-; GFX11-NEXT: s_cbranch_vccnz .LBB59_4
-; GFX11-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s40
-; GFX11-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s22, s14
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-NEXT: s_pack_ll_b32_b16 s12, s20, s12
-; GFX11-NEXT: s_pack_ll_b32_b16 s11, s19, s11
-; GFX11-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s9, s17, s9
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s5
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: s_branch .LBB59_5
-; GFX11-NEXT: .LBB59_3:
-; GFX11-NEXT: s_branch .LBB59_2
-; GFX11-NEXT: .LBB59_4:
-; GFX11-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
-; GFX11-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
-; GFX11-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
-; GFX11-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
-; GFX11-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
-; GFX11-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
-; GFX11-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
-; GFX11-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
-; GFX11-NEXT: .LBB59_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v36i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
+; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: s_branch .LBB59_5
+; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB59_2
+; GFX11-TRUE16-NEXT: .LBB59_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v36i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4
+; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: s_branch .LBB59_5
+; GFX11-FAKE16-NEXT: .LBB59_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB59_2
+; GFX11-FAKE16-NEXT: .LBB59_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-FAKE16-NEXT: .LBB59_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 47cb6bd..44cfd6c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -4913,93 +4913,270 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
@@ -8342,93 +8519,270 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
@@ -11100,142 +11454,271 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v20f32_to_v40i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
-; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
-; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v5, v36, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v6, v35, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v9, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v3, v38, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v37, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v7, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12629,93 +13112,270 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
@@ -14269,142 +14929,271 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v20f32_to_v40f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
-; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
-; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v5, v36, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v6, v35, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v9, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v3, v38, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v37, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v7, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16043,93 +16832,270 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
@@ -19655,93 +20621,270 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
@@ -23094,93 +24237,270 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
@@ -24382,142 +25702,271 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v10f64_to_v40i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v37, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v36, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v33, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v5, v35, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr9
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -25911,93 +27360,270 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
@@ -27484,142 +29110,271 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v10f64_to_v40f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v37, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v36, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v33, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v5, v35, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr9
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -29258,93 +31013,270 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
@@ -31057,12 +32989,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -31083,17 +33013,16 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v20.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
@@ -31109,59 +33038,61 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40
@@ -31172,47 +33103,37 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v20 :: v_dual_mov_b32 v1, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
@@ -32879,12 +34800,10 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -32905,17 +34824,16 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v20.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
@@ -32931,59 +34849,61 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40
@@ -32994,47 +34914,37 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v20 :: v_dual_mov_b32 v1, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 11f90b9..14e17ce 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2411,66 +2411,123 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5542,66 +5599,123 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8386,66 +8500,123 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v2i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10905,66 +11076,123 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v2f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12814,47 +13042,40 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v5 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v0, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -13071,60 +13292,112 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v4i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14889,65 +15162,124 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v4f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16614,88 +16946,172 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v4, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v8i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB109_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
-; GFX11-NEXT: s_lshr_b32 s6, s1, 24
-; GFX11-NEXT: s_lshr_b32 s8, s1, 16
-; GFX11-NEXT: s_lshr_b32 s7, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB109_4
-; GFX11-NEXT: .LBB109_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v9, v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v10, v6, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v10
-; GFX11-NEXT: v_mov_b32_e32 v4, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB109_3:
-; GFX11-NEXT: ; implicit-def: $sgpr3
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: ; implicit-def: $sgpr2
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: s_branch .LBB109_2
-; GFX11-NEXT: .LBB109_4:
-; GFX11-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
-; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v7 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v6.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v10
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v8
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB109_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: s_branch .LBB109_2
+; GFX11-TRUE16-NEXT: .LBB109_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v8i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB109_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: s_branch .LBB109_2
+; GFX11-FAKE16-NEXT: .LBB109_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 2cc7c44..87d5157 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -5328,105 +5328,278 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22i32_scalar:
@@ -9137,105 +9310,278 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22i32_scalar:
@@ -12099,155 +12445,295 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v22f32_to_v44i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
-; GFX11-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v23, v23, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v7, v48, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v5, v50, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v49, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v39, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v37, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13805,105 +14291,278 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22f32_scalar:
@@ -15630,155 +16289,295 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v22f32_to_v44f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
-; GFX11-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v23, v23, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v7, v48, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v5, v50, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v49, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v39, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v37, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17607,105 +18406,278 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22f32_scalar:
@@ -21568,105 +22540,278 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11i64_scalar:
@@ -25389,105 +26534,278 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11i64_scalar:
@@ -26793,154 +28111,294 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v11f64_to_v44i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v49, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v48, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v39, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v37, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v11, v11, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr11
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -28498,105 +29956,278 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11f64_scalar:
@@ -30248,154 +31879,294 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v11f64_to_v44f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v49, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v48, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v39, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v37, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v11, v11, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr11
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -32224,105 +33995,278 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11f64_scalar:
@@ -34283,15 +36227,10 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -34313,19 +36252,18 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v22.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
@@ -34343,63 +36281,67 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s24, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
@@ -34410,53 +36352,40 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v44f16_scalar:
@@ -36279,15 +38208,10 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -36309,19 +38233,18 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v22.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
@@ -36339,63 +38262,67 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s24 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
@@ -36406,53 +38333,40 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v44i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index c35e183..fb2e94f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -5805,117 +5805,286 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24i32_scalar:
@@ -10044,117 +10213,286 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24i32_scalar:
@@ -13212,166 +13550,317 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v24f32_to_v48i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
-; GFX11-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v51, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v12, v49, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v9, v52, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v48, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v7, v54, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v53, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v11, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15153,117 +15642,286 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24f32_scalar:
@@ -17167,166 +17825,317 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v24f32_to_v48f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
-; GFX11-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v51, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v12, v49, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v9, v52, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v48, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v7, v54, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v53, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v11, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19382,117 +20191,286 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24f32_scalar:
@@ -23764,117 +24742,286 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12i64_scalar:
@@ -28015,117 +29162,286 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12i64_scalar:
@@ -29551,166 +30867,317 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v12f64_to_v48i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v53, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v52, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v51, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v10, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v49, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v48, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -31492,117 +32959,286 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12f64_scalar:
@@ -33424,166 +35060,317 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v12f64_to_v48f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v53, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v52, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v51, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v10, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v49, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v48, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -35639,117 +37426,286 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12f64_scalar:
@@ -37964,19 +39920,11 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v48f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v23.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -37998,22 +39946,21 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v24.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -38032,67 +39979,73 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
@@ -38103,58 +40056,43 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16_scalar:
@@ -40168,19 +42106,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v48i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v23.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -40202,22 +42132,21 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v24.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -40236,67 +42165,73 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
@@ -40307,58 +42242,43 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 29005a4..07cdbef 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -6286,129 +6286,295 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26i32_scalar:
@@ -10946,129 +11112,295 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26i32_scalar:
@@ -14389,178 +14721,340 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v26f32_to_v52i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
-; GFX11-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v11, v64, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v53, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v9, v66, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v65, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16527,129 +17021,295 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26f32_scalar:
@@ -18769,178 +19429,340 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v26f32_to_v52f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
-; GFX11-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v11, v64, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v53, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v9, v66, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v65, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -21183,129 +22005,295 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26f32_scalar:
@@ -25980,129 +26968,295 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13i64_scalar:
@@ -30655,129 +31809,295 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13i64_scalar:
@@ -32378,178 +33698,340 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr25
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v13f64_to_v52i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v65, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v64, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v66, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v53, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v11, v55, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v15, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr15
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -34516,129 +35998,295 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13f64_scalar:
@@ -36667,178 +38315,340 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr25
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v13f64_to_v52f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v65, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v64, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v66, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v53, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v11, v55, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v15, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr15
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -39081,129 +40891,295 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13f64_scalar:
@@ -41806,23 +43782,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v52f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -41844,26 +43809,25 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v26.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -41882,71 +43846,79 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
@@ -41957,62 +43929,46 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v52f16_scalar:
@@ -44258,23 +46214,12 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v52i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -44296,26 +46241,25 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v26.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -44334,71 +46278,79 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
@@ -44409,62 +46361,46 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v52i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8ee5b96..8eb71e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -6779,141 +6779,299 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28i32_scalar:
@@ -11885,141 +12043,299 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28i32_scalar:
@@ -15595,191 +15911,364 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v28f32_to_v56i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
-; GFX11-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_mov_b32_e32 v5, v35
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17915,141 +18404,299 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32_scalar:
@@ -20379,191 +21026,364 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v28f32_to_v56f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
-; GFX11-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_mov_b32_e32 v5, v35
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -23006,141 +23826,299 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32_scalar:
@@ -28216,141 +29194,299 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64_scalar:
@@ -33336,141 +34472,299 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64_scalar:
@@ -35225,191 +36519,364 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr27
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v14f64_to_v56i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
-; GFX11-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
-; GFX11-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
-; GFX11-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
-; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
-; GFX11-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
-; GFX11-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
-; GFX11-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
-; GFX11-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v26
-; GFX11-NEXT: v_mov_b32_e32 v9, v29
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -37545,141 +39012,299 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64_scalar:
@@ -39918,191 +41543,364 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr27
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v14f64_to_v56f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
-; GFX11-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
-; GFX11-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
-; GFX11-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
-; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
-; GFX11-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
-; GFX11-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
-; GFX11-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
-; GFX11-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v26
-; GFX11-NEXT: v_mov_b32_e32 v9, v29
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -42545,141 +44343,299 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14f64_scalar:
@@ -45566,27 +47522,13 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v56f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v27.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -45608,30 +47550,29 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v28.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -45650,75 +47591,85 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v34, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
@@ -45729,69 +47680,49 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v64, 16, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v56f16_scalar:
@@ -48280,27 +50211,13 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v56i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v27.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -48322,30 +50239,29 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v28.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -48364,75 +50280,85 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v34, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
@@ -48443,69 +50369,49 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v64, 16, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v56i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 967f1a9..93c11f1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -7240,153 +7240,305 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32_scalar:
@@ -12840,153 +12992,305 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32_scalar:
@@ -16802,204 +17106,388 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v30f32_to_v60i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
-; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19290,153 +19778,305 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32_scalar:
@@ -21985,204 +22625,388 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v30f32_to_v60f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
-; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -24867,153 +25691,305 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32_scalar:
@@ -30472,153 +31448,305 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64_scalar:
@@ -36089,153 +37217,305 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64_scalar:
@@ -38144,204 +39424,388 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v15f64_to_v60i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v31, v82, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v9, v33
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_mov_b32_e32 v8, v32
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -40632,153 +42096,305 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64_scalar:
@@ -43227,204 +44843,388 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v15f64_to_v60f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v31, v82, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v9, v33
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_mov_b32_e32 v8, v32
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -46109,153 +47909,305 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64_scalar:
@@ -49421,31 +51373,14 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v60f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -49467,34 +51402,33 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v30.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -49513,79 +51447,91 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v34, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v49, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v48, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v38, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
@@ -49596,75 +51542,52 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v60f16_scalar:
@@ -52368,31 +54291,14 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v60i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -52414,34 +54320,33 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v30.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -52460,79 +54365,91 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v34, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v49, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v48, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v38, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
@@ -52543,75 +54460,52 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v60i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 9a6ea1b..6ada0cb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -2402,89 +2402,171 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v3i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT: s_cbranch_vccnz .LBB11_4
-; GFX11-NEXT: .LBB11_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB11_3:
-; GFX11-NEXT: s_branch .LBB11_2
-; GFX11-NEXT: .LBB11_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB11_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB11_4
+; GFX11-TRUE16-NEXT: .LBB11_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v7, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB11_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB11_2
+; GFX11-TRUE16-NEXT: .LBB11_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB11_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB11_4
+; GFX11-FAKE16-NEXT: .LBB11_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB11_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB11_2
+; GFX11-FAKE16-NEXT: .LBB11_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5536,89 +5618,171 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v3f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT: s_cbranch_vccnz .LBB27_4
-; GFX11-NEXT: .LBB27_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB27_3:
-; GFX11-NEXT: s_branch .LBB27_2
-; GFX11-NEXT: .LBB27_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB27_4
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v7, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB27_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB27_2
+; GFX11-TRUE16-NEXT: .LBB27_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB27_4
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB27_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB27_2
+; GFX11-FAKE16-NEXT: .LBB27_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8229,124 +8393,243 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v4, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v12i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s13, s2, 16
-; GFX11-NEXT: s_lshr_b32 s12, s2, 8
-; GFX11-NEXT: s_lshr_b32 s8, s1, 24
-; GFX11-NEXT: s_lshr_b32 s14, s1, 16
-; GFX11-NEXT: s_lshr_b32 s9, s1, 8
-; GFX11-NEXT: s_lshr_b32 s11, s0, 16
-; GFX11-NEXT: s_lshr_b32 s10, s0, 8
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT: s_cbranch_vccnz .LBB39_4
-; GFX11-NEXT: .LBB39_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s2
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v0, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v0, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v3, v3, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshl_or_b32 v11, v7, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v11
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: v_mov_b32_e32 v4, v13
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB39_3:
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: s_branch .LBB39_2
-; GFX11-NEXT: .LBB39_4:
-; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10
-; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8
-; GFX11-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9
-; GFX11-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v12i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_4
+; GFX11-TRUE16-NEXT: .LBB39_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v13
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB39_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: s_branch .LBB39_2
+; GFX11-TRUE16-NEXT: .LBB39_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v12i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_4
+; GFX11-FAKE16-NEXT: .LBB39_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v3, v3, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v7, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v13
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB39_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: s_branch .LBB39_2
+; GFX11-FAKE16-NEXT: .LBB39_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11712,89 +11995,169 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v6f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_4
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v9 :: v_dual_add_nc_u32 v1, v1, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_3:
-; GFX11-NEXT: s_branch .LBB49_2
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v6f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v6f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v9 :: v_dual_add_nc_u32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12306,64 +12669,57 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v12, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v10 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v0, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v8, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v8, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v7, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
; GFX11-TRUE16-NEXT: .LBB52_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -12651,80 +13007,151 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v6i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_4
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_add_nc_u32 v7, v7, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v4, v5
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_3:
-; GFX11-NEXT: s_branch .LBB53_2
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v6i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v4, v9 :: v_dual_add_nc_u32 v9, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v6i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v10, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_add_nc_u32 v7, v7, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v4, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index e71bf15..e34aaf20 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -136,7 +136,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v1, vcc
; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
-; GCN-NEXT: v_max_i32_e32 v1, v0, v1
+; GCN-NEXT: v_max_i32_e32 v1, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
; GCN-NEXT: s_mov_b32 s4, 0xf4240
@@ -218,7 +218,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 5, vcc
; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
-; GCN-NEXT: v_max_i32_e32 v1, v0, v1
+; GCN-NEXT: v_max_i32_e32 v1, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
; GCN-NEXT: s_mov_b32 s4, 0xf4240
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index e27164c..948811e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -6191,37 +6191,34 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s8, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s8
-; GFX6-NEXT: s_xor_b32 s3, s3, s8
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT: s_sub_i32 s4, 0, s3
-; GFX6-NEXT: s_ashr_i32 s9, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s9
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s2, s2, s9
+; GFX6-NEXT: s_abs_i32 s8, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT: s_sub_i32 s4, 0, s8
+; GFX6-NEXT: s_abs_i32 s9, s2
; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s0, s0, s3
-; GFX6-NEXT: s_sub_i32 s0, s2, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_mul_i32 s0, s0, s8
+; GFX6-NEXT: s_sub_i32 s0, s9, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s8
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: s_cselect_b32 s0, s1, s0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT: s_xor_b32 s0, s2, s3
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT: s_xor_b32 s0, s9, s8
+; GFX6-NEXT: s_ashr_i32 s0, s0, 31
; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -6233,35 +6230,32 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
-; GFX9-NEXT: s_ashr_i32 s5, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s5
+; GFX9-NEXT: s_abs_i32 s4, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_sub_i32 s6, 0, s4
+; GFX9-NEXT: s_abs_i32 s5, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s6, s6, s7
; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7
-; GFX9-NEXT: s_mul_i32 s8, s6, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT: s_mul_i32 s8, s6, s4
+; GFX9-NEXT: s_sub_i32 s5, s5, s8
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_sub_i32 s8, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_sub_i32 s8, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s6, s7, s6
-; GFX9-NEXT: s_cselect_b32 s2, s8, s2
+; GFX9-NEXT: s_cselect_b32 s5, s8, s5
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s7, s6
-; GFX9-NEXT: s_xor_b32 s3, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
+; GFX9-NEXT: s_cselect_b32 s4, s7, s6
; GFX9-NEXT: s_xor_b32 s2, s2, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s4, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -6706,38 +6700,37 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-LABEL: srem_i32_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s4, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s4
-; GFX6-NEXT: s_xor_b32 s4, s3, s4
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT: s_sub_i32 s3, 0, s4
-; GFX6-NEXT: s_ashr_i32 s5, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s5
+; GFX6-NEXT: s_abs_i32 s3, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT: s_sub_i32 s4, 0, s3
+; GFX6-NEXT: s_abs_i32 s8, s2
+; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s6, s2, s5
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: v_readfirstlane_b32 s7, v0
-; GFX6-NEXT: s_mul_i32 s7, s7, s4
-; GFX6-NEXT: s_sub_i32 s6, s6, s7
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s6, s7, s6
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s4, s7, s6
-; GFX6-NEXT: s_xor_b32 s4, s4, s5
-; GFX6-NEXT: s_sub_i32 s4, s4, s5
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s0, s0, s3
+; GFX6-NEXT: s_sub_i32 s0, s8, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_ashr_i32 s1, s2, 31
+; GFX6-NEXT: s_xor_b32 s0, s0, s1
+; GFX6-NEXT: s_sub_i32 s0, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32_pow2_shl_denom:
@@ -6746,32 +6739,29 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
+; GFX9-NEXT: s_abs_i32 s3, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s5, 0, s3
-; GFX9-NEXT: s_ashr_i32 s4, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s4
+; GFX9-NEXT: s_abs_i32 s4, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s5, s5, s6
; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
; GFX9-NEXT: s_add_i32 s6, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s6
; GFX9-NEXT: s_mul_i32 s5, s5, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
+; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s4, s5, s4
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s3, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index 861621b..c1b8bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -410,26 +410,14 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) {
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-SDAG-LABEL: undef_lo2_v4i16:
-; GFX11-TRUE16-SDAG: ; %bb.0:
-; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-SDAG-NEXT: ; use v[0:1]
-; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND
-; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-GISEL-LABEL: undef_lo2_v4i16:
-; GFX11-TRUE16-GISEL: ; %bb.0:
-; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-GISEL-NEXT: ; use v[0:1]
-; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND
-; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: undef_lo2_v4i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use v[0:1]
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
%undef.lo = shufflevector <2 x i16> %arg0, <2 x i16> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo);
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 3cf70c4..d7d697e 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -576,11 +576,11 @@ define i32 @sdiv32(i32 %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1
-; GFX9-NEXT: v_max_i32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_i32_e32 v2, v2, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2
; GFX9-NEXT: v_sub_u32_e32 v5, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v5, v0, v5
+; GFX9-NEXT: v_max_i32_e32 v5, v5, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
@@ -640,11 +640,11 @@ define i32 @srem32(i32 %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1
-; GFX9-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX9-NEXT: v_max_i32_e32 v1, v2, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v4, v0, v4
+; GFX9-NEXT: v_max_i32_e32 v4, v4, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 9c59b42..ab96dcf 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -563,10 +563,9 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
; GFX11-TRUE16-LABEL: divergent_vec_i16_HH:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: divergent_vec_i16_HH:
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 043bcc3..f64615d 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -264,3 +264,90 @@ body: |
$sgpr0 = COPY %16:sreg_32
SI_RETURN_TO_EPILOG $sgpr0
...
+
+---
+name: s_pack_ll_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_ll_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].lo16, %subreg.lo16, [[DEF1]].lo16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_LL_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_lh_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_lh_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].lo16, %subreg.lo16, [[DEF1]].hi16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_LH_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_hl_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_hl_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].hi16, %subreg.lo16, [[DEF1]].lo16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_HL_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_hh_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_hh_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].hi16, %subreg.lo16, [[DEF1]].hi16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_HH_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_ll_b32_b16_use_SALU16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_ll_b32_b16_use_SALU16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[DEF]].lo16, 0, [[DEF]].lo16, 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FMAC_F16_t16_e64_]], %subreg.lo16, [[DEF]].lo16, %subreg.hi16
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_FMAC_F16 %1:sreg_32, %1:sreg_32, %1:sreg_32, implicit $mode
+ %3:sreg_32 = S_PACK_LL_B32_B16 %2:sreg_32, %1:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_ll_b32_b16_use_imm
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_ll_b32_b16_use_imm
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].lo16, %subreg.lo16, [[DEF]].lo16, %subreg.hi16
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_PACK_LL_B32_B16 1, %1:sreg_32, implicit-def dead $scc
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 76da0aa..10c60df 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -478,41 +478,76 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fneg_fabs_v2bf16_non_bc_src:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, s1, 1.0
-; GFX11-NEXT: v_add_f32_e64 v1, s0, 2.0
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_fabs_v2bf16_non_bc_src:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s1, 2.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s0, 1.0
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x80008000, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_fabs_v2bf16_non_bc_src:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s1, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s0, 2.0
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x80008000, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%add = fadd <2 x bfloat> %in, <bfloat 1.0, bfloat 2.0>
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %add)
%fneg.fabs = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %fabs
@@ -752,42 +787,78 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fold_user_fneg_fabs_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s1, s0, 0x7fff
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX11-NEXT: v_mul_f32_e64 v0, s1, -4.0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e64 v1, s0, -4.0
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fold_user_fneg_fabs_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0x7fff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v0, s1, -4.0
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fold_user_fneg_fabs_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0x7fff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v0, s1, -4.0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
%fneg.fabs = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %fabs
%mul = fmul <2 x bfloat> %fneg.fabs, <bfloat 4.0, bfloat 4.0>
@@ -975,46 +1046,88 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s0, s6, 0x7fff
-; GFX11-NEXT: s_lshr_b32 s1, s6, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_and_b32 s1, s1, 0x7fff
-; GFX11-NEXT: v_mul_f32_e64 v0, s0, -4.0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e64 v1, s0, -4.0
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v2, v3, s[0:1]
-; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s6, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s6, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v0, s0, -4.0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s6, 0x7fff7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v3, s[0:1]
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0x7fff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s6, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0x7fff
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v0, s0, -4.0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s6, 0x7fff7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v3, s[0:1]
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
%fneg = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %fabs
%mul = fmul <2 x bfloat> %fneg, <bfloat 4.0, bfloat 4.0>
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index 98044a7..84b904f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -712,47 +712,88 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fneg_fold_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x8000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_f32 v3, v3, v4 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fneg_fold_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_f32 v1, v3, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fneg_fold_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mul_f32 v3, v3, v4 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <2 x bfloat>, ptr addrspace(1) %in
%fsub = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %val
%fmul = fmul <2 x bfloat> %fsub, %val
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index f048dc5..a43292d 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -330,11 +330,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 96abb3a..96cb621 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -329,11 +329,8 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 415828f..35d178c 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -5972,16 +5972,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: .LBB9_16: ; %Flow54
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v4.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v3.l|
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v7.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v7.l, s2
; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -6422,19 +6420,16 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s2, 0x7c00
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s2, s3
; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s5, 0
; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s4, 0x7c00
-; GFX1150-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2
-; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s2
; GFX1150-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
@@ -6902,20 +6897,17 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s2, 0x7c00
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s2, s3
; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s5, 0
; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s4, 0x7c00
-; GFX1200-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s2
; GFX1200-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
@@ -9346,29 +9338,23 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: .LBB10_32: ; %Flow124
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v2.l
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v6.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v5.l|
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v7.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v7.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l|
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v10.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v8.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v9.l|
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v11.l, s2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1
-; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v11.l, s2
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: frem_v4f16:
@@ -10209,21 +10195,19 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s3, 0x7c00
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s3, s3, s4
; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s6, 0
; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s3
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s5, 0x7c00
-; GFX1150-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s3, s4, s3
; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s2, 0
-; GFX1150-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v1.l, s3
+; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s3
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s8, 0x7c00
-; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2
@@ -10232,13 +10216,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s7, 0x7c00
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v3.l, s2
-; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s2
; GFX1150-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
@@ -11147,18 +11128,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s3
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s5, 0x7c00
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s3, s4, s3
; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s2, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1200-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v1.l, s3
+; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s3
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s8, 0x7c00
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2
@@ -11168,15 +11145,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s7, 0x7c00
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1200-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v3.l, s2
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s2
; GFX1200-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 792d7db..76016e4 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -850,15 +850,13 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v2, 16, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index a2c1545..447a5f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -361,12 +361,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -444,12 +442,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 6f7c001..2e0e420 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -361,12 +361,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -444,12 +442,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index 8896364..ebb33684 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -455,12 +455,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -550,12 +548,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index 23db247..40be567 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -455,12 +455,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -550,12 +548,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 92a2f54..068a989 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1053,19 +1053,15 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-LABEL: s_maximum_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, s0, s1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, s3, s2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 9e82b41..2482d10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -866,19 +866,15 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-LABEL: s_minimum_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
; GFX11-TRUE16-NEXT: v_pk_min_f16 v0, s0, s1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, s3, s2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
index dcf01f7..818dff4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
@@ -63,14 +63,10 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a)
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v0.l
-; GFX12-TRUE16-NEXT: v_nop
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v1.l
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b534c2c..6f63384 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -9604,11 +9604,11 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
;
@@ -9738,11 +9738,11 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_bfe_i32 v2, v1, 0, 16
; GFX12-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 5c0f813..441509b 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -391,156 +391,144 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: sdiv_v2i32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s10, s2
-; GCN-NEXT: s_mov_b32 s11, s3
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s6
-; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s0, v2
-; GCN-NEXT: s_abs_i32 s1, s0
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1
-; GCN-NEXT: s_sub_i32 s6, 0, s1
-; GCN-NEXT: v_readfirstlane_b32 s8, v3
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v4, s6, v2
-; GCN-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NEXT: s_abs_i32 s7, s6
-; GCN-NEXT: s_xor_b32 s0, s6, s0
-; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
-; GCN-NEXT: s_ashr_i32 s6, s0, 31
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s0, s0, s1
-; GCN-NEXT: s_sub_i32 s0, s7, s0
-; GCN-NEXT: s_sub_i32 s7, s0, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s1
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: s_cselect_b32 s0, s7, s0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s1
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_abs_i32 s7, s8
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_sub_i32 s4, 0, s7
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, s6, v0
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
-; GCN-NEXT: v_readfirstlane_b32 s4, v1
-; GCN-NEXT: s_xor_b32 s5, s4, s8
-; GCN-NEXT: s_abs_i32 s4, s4
-; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
-; GCN-NEXT: s_ashr_i32 s5, s5, 31
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
-; GCN-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-NEXT: s_mul_i32 s6, s6, s7
-; GCN-NEXT: s_sub_i32 s4, s4, s6
-; GCN-NEXT: s_sub_i32 s6, s4, s7
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GCN-NEXT: s_cmp_ge_u32 s4, s7
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT: s_cselect_b32 s4, s6, s4
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GCN-NEXT: s_cmp_ge_u32 s4, s7
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, s5, v1
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v1
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GCN-NEXT: v_xor_b32_e32 v4, v0, v2
+; GCN-NEXT: v_xor_b32_e32 v7, v1, v3
+; GCN-NEXT: v_max_i32_e32 v2, v2, v6
+; GCN-NEXT: v_max_i32_e32 v3, v3, v9
+; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0
+; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GCN-NEXT: v_max_i32_e32 v0, v0, v5
+; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v6
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v5
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GCN-NEXT: v_mul_hi_u32 v9, v6, v9
+; GCN-NEXT: v_max_i32_e32 v1, v1, v8
+; GCN-NEXT: v_mul_hi_u32 v8, v5, v10
+; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT: v_mul_hi_u32 v6, v0, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
+; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GCN-NEXT: v_mul_lo_u32 v8, v6, v2
+; GCN-NEXT: v_mul_lo_u32 v10, v5, v3
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v7
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s3, 0xf000
-; TONGA-NEXT: s_mov_b32 s2, -1
-; TONGA-NEXT: s_mov_b32 s10, s2
-; TONGA-NEXT: s_mov_b32 s11, s3
+; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; TONGA-NEXT: s_mov_b32 s7, 0xf000
+; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_mov_b32 s10, s6
+; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s6
-; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: s_mov_b32 s8, s2
+; TONGA-NEXT: s_mov_b32 s9, s3
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: s_mov_b32 s4, s0
+; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_readfirstlane_b32 s0, v2
-; TONGA-NEXT: s_abs_i32 s1, s0
-; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s1
-; TONGA-NEXT: s_sub_i32 s6, 0, s1
-; TONGA-NEXT: v_readfirstlane_b32 s8, v3
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
-; TONGA-NEXT: v_mul_lo_u32 v4, s6, v2
-; TONGA-NEXT: v_readfirstlane_b32 s6, v0
-; TONGA-NEXT: s_abs_i32 s7, s6
-; TONGA-NEXT: s_xor_b32 s0, s6, s0
-; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
-; TONGA-NEXT: s_ashr_i32 s6, s0, 31
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v4
-; TONGA-NEXT: v_mul_hi_u32 v0, s7, v0
-; TONGA-NEXT: v_readfirstlane_b32 s0, v0
-; TONGA-NEXT: s_mul_i32 s0, s0, s1
-; TONGA-NEXT: s_sub_i32 s0, s7, s0
-; TONGA-NEXT: s_sub_i32 s7, s0, s1
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
-; TONGA-NEXT: s_cmp_ge_u32 s0, s1
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; TONGA-NEXT: s_cselect_b32 s0, s7, s0
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
-; TONGA-NEXT: s_cmp_ge_u32 s0, s1
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: s_abs_i32 s7, s8
-; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s7
-; TONGA-NEXT: s_mov_b32 s0, s4
-; TONGA-NEXT: s_sub_i32 s4, 0, s7
-; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; TONGA-NEXT: v_xor_b32_e32 v0, s6, v0
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s6, v0
-; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_lo_u32 v4, s4, v3
-; TONGA-NEXT: v_readfirstlane_b32 s4, v1
-; TONGA-NEXT: s_xor_b32 s5, s4, s8
-; TONGA-NEXT: s_abs_i32 s4, s4
-; TONGA-NEXT: v_mul_hi_u32 v1, v3, v4
-; TONGA-NEXT: s_ashr_i32 s5, s5, 31
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
-; TONGA-NEXT: v_readfirstlane_b32 s6, v1
-; TONGA-NEXT: s_mul_i32 s6, s6, s7
-; TONGA-NEXT: s_sub_i32 s4, s4, s6
-; TONGA-NEXT: s_sub_i32 s6, s4, s7
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
-; TONGA-NEXT: s_cmp_ge_u32 s4, s7
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; TONGA-NEXT: s_cselect_b32 s4, s6, s4
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
-; TONGA-NEXT: s_cmp_ge_u32 s4, s7
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; TONGA-NEXT: v_xor_b32_e32 v1, s5, v1
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s5, v1
-; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3
+; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2
+; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3
+; TONGA-NEXT: v_max_i32_e32 v2, v2, v6
+; TONGA-NEXT: v_max_i32_e32 v3, v3, v9
+; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2
+; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; TONGA-NEXT: v_max_i32_e32 v0, v0, v5
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6
+; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3
+; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6
+; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1
+; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9
+; TONGA-NEXT: v_max_i32_e32 v1, v1, v8
+; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10
+; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8
+; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6
+; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
+; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2
+; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
+; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
+; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7
+; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32:
@@ -558,44 +546,44 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: s_abs_i32 s1, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_xor_b32 s0, s4, s0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: s_xor_b32 s0, s5, s0
; GFX9-NEXT: s_ashr_i32 s6, s0, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: s_sub_i32 s0, 0, s1
-; GFX9-NEXT: s_abs_i32 s4, s4
-; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: s_abs_i32 s5, s5
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s0, s0, s7
; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0
; GFX9-NEXT: s_add_i32 s7, s7, s0
-; GFX9-NEXT: s_mul_hi_u32 s0, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7
; GFX9-NEXT: s_mul_i32 s7, s0, s1
-; GFX9-NEXT: s_sub_i32 s4, s4, s7
+; GFX9-NEXT: s_sub_i32 s5, s5, s7
; GFX9-NEXT: s_add_i32 s10, s0, 1
-; GFX9-NEXT: s_sub_i32 s7, s4, s1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s1
+; GFX9-NEXT: s_sub_i32 s7, s5, s1
+; GFX9-NEXT: s_cmp_ge_u32 s5, s1
; GFX9-NEXT: s_cselect_b32 s0, s10, s0
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_cselect_b32 s5, s7, s5
; GFX9-NEXT: s_add_i32 s7, s0, 1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s4, s7, s0
-; GFX9-NEXT: s_abs_i32 s7, s5
+; GFX9-NEXT: s_cmp_ge_u32 s5, s1
+; GFX9-NEXT: s_cselect_b32 s5, s7, s0
+; GFX9-NEXT: s_abs_i32 s7, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT: s_xor_b32 s4, s4, s6
+; GFX9-NEXT: s_xor_b32 s5, s5, s6
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_sub_i32 s9, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s4, s4, s6
+; GFX9-NEXT: s_sub_i32 s5, s5, s6
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s5, s8, s5
+; GFX9-NEXT: s_xor_b32 s4, s8, s4
; GFX9-NEXT: s_abs_i32 s8, s8
-; GFX9-NEXT: s_ashr_i32 s5, s5, 31
+; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s9, s9, s6
; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9
@@ -611,10 +599,10 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_add_i32 s9, s6, 1
; GFX9-NEXT: s_cmp_ge_u32 s8, s7
; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_xor_b32 s6, s6, s5
-; GFX9-NEXT: s_sub_i32 s5, s6, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_xor_b32 s6, s6, s4
+; GFX9-NEXT: s_sub_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -804,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: sdiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s10, -1
-; GCN-NEXT: s_mov_b32 s6, s10
-; GCN-NEXT: s_mov_b32 s7, s11
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s2
-; GCN-NEXT: s_mov_b32 s5, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; GCN-NEXT: s_mov_b32 s8, s0
-; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s1, v1
-; GCN-NEXT: v_readfirstlane_b32 s2, v2
-; GCN-NEXT: s_abs_i32 s13, s0
-; GCN-NEXT: s_abs_i32 s14, s1
-; GCN-NEXT: s_abs_i32 s15, s2
-; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15
-; GCN-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; GCN-NEXT: v_xor_b32_e32 v8, v0, v4
+; GCN-NEXT: v_max_i32_e32 v4, v4, v10
+; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
+; GCN-NEXT: v_xor_b32_e32 v11, v1, v5
+; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10
+; GCN-NEXT: v_max_i32_e32 v5, v5, v13
+; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
+; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v10
+; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
+; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13
+; GCN-NEXT: v_max_i32_e32 v0, v0, v9
+; GCN-NEXT: v_mul_hi_u32 v16, v10, v16
+; GCN-NEXT: v_max_i32_e32 v1, v1, v12
+; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v13
+; GCN-NEXT: v_mul_hi_u32 v10, v0, v10
+; GCN-NEXT: v_xor_b32_e32 v14, v2, v6
+; GCN-NEXT: v_max_i32_e32 v6, v6, v15
+; GCN-NEXT: v_mul_hi_u32 v12, v13, v16
+; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6
+; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GCN-NEXT: v_mul_lo_u32 v13, v10, v4
+; GCN-NEXT: v_mul_hi_u32 v12, v1, v12
+; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT: v_mul_lo_u32 v0, v12, v5
+; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5
+; GCN-NEXT: v_mul_lo_u32 v4, v4, v9
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7
+; GCN-NEXT: v_max_i32_e32 v5, v7, v0
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v4
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: s_abs_i32 s17, s6
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT: v_max_i32_e32 v2, v2, v9
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s3, v4
-; GCN-NEXT: v_readfirstlane_b32 s4, v5
-; GCN-NEXT: v_readfirstlane_b32 s5, v6
-; GCN-NEXT: s_xor_b32 s12, s3, s0
-; GCN-NEXT: s_xor_b32 s0, s4, s1
-; GCN-NEXT: s_xor_b32 s1, s5, s2
-; GCN-NEXT: s_sub_i32 s2, 0, s13
-; GCN-NEXT: s_ashr_i32 s18, s0, 31
-; GCN-NEXT: s_sub_i32 s0, 0, s14
-; GCN-NEXT: s_ashr_i32 s19, s1, 31
-; GCN-NEXT: s_sub_i32 s1, 0, s15
-; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s0, v1
-; GCN-NEXT: v_mul_lo_u32 v6, s1, v2
-; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT: v_mul_hi_u32 v6, v2, v6
-; GCN-NEXT: s_sub_i32 s20, 0, s17
-; GCN-NEXT: v_readfirstlane_b32 s7, v7
-; GCN-NEXT: s_abs_i32 s3, s3
-; GCN-NEXT: s_abs_i32 s4, s4
-; GCN-NEXT: s_abs_i32 s5, s5
-; GCN-NEXT: v_mul_lo_u32 v7, s20, v3
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s5, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v7
-; GCN-NEXT: v_mul_lo_u32 v4, v0, s13
-; GCN-NEXT: v_mul_lo_u32 v6, v1, s14
-; GCN-NEXT: v_mul_lo_u32 v8, v2, s15
-; GCN-NEXT: s_abs_i32 s16, s7
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v3, s16, v3
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v4
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, s4, v6
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2
-; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
-; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
-; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
-; GCN-NEXT: v_subrev_i32_e32 v10, vcc, s13, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v6
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s15, v8
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v2
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, s17
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
-; GCN-NEXT: s_ashr_i32 s12, s12, 31
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, s12, v0
-; GCN-NEXT: v_xor_b32_e32 v1, s18, v1
-; GCN-NEXT: v_xor_b32_e32 v2, s19, v2
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, s16, v4
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s18, v1
-; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s19, v2
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s17, v4
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT: s_xor_b32 s0, s7, s6
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; GCN-NEXT: s_ashr_i32 s0, s0, 31
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, s0, v3
-; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v6
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v10
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; GCN-NEXT: v_max_i32_e32 v6, v3, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_mul_hi_u32 v4, v6, v4
+; GCN-NEXT: v_xor_b32_e32 v2, v2, v14
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v5
+; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_xor_b32_e32 v4, v4, v3
+; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s11, 0xf000
-; TONGA-NEXT: s_mov_b32 s10, -1
-; TONGA-NEXT: s_mov_b32 s6, s10
-; TONGA-NEXT: s_mov_b32 s7, s11
+; TONGA-NEXT: s_mov_b32 s7, 0xf000
+; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_mov_b32 s10, s6
+; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; TONGA-NEXT: s_mov_b32 s8, s0
-; TONGA-NEXT: s_mov_b32 s9, s1
+; TONGA-NEXT: s_mov_b32 s8, s2
+; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; TONGA-NEXT: s_mov_b32 s4, s0
+; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(1)
-; TONGA-NEXT: v_readfirstlane_b32 s0, v0
-; TONGA-NEXT: v_readfirstlane_b32 s1, v1
-; TONGA-NEXT: v_readfirstlane_b32 s2, v2
-; TONGA-NEXT: s_abs_i32 s13, s0
-; TONGA-NEXT: s_abs_i32 s14, s1
-; TONGA-NEXT: s_abs_i32 s15, s2
-; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s13
-; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s14
-; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s15
-; TONGA-NEXT: v_readfirstlane_b32 s6, v3
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0
+; TONGA-NEXT: s_waitcnt vmcnt(0)
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
+; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4
+; TONGA-NEXT: v_max_i32_e32 v4, v4, v10
+; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5
+; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10
+; TONGA-NEXT: v_max_i32_e32 v5, v5, v13
+; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4
+; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10
+; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
+; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13
+; TONGA-NEXT: v_max_i32_e32 v0, v0, v9
+; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16
+; TONGA-NEXT: v_max_i32_e32 v1, v1, v12
+; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6
+; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13
+; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10
+; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6
+; TONGA-NEXT: v_max_i32_e32 v6, v6, v15
+; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16
+; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6
+; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12
+; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4
+; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5
+; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9
+; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5
+; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7
+; TONGA-NEXT: v_max_i32_e32 v5, v7, v0
+; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; TONGA-NEXT: s_abs_i32 s17, s6
-; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s17
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT: v_max_i32_e32 v2, v2, v9
+; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
-; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1
-; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
-; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_readfirstlane_b32 s3, v4
-; TONGA-NEXT: v_readfirstlane_b32 s4, v5
-; TONGA-NEXT: v_readfirstlane_b32 s5, v6
-; TONGA-NEXT: s_xor_b32 s12, s3, s0
-; TONGA-NEXT: s_xor_b32 s0, s4, s1
-; TONGA-NEXT: s_xor_b32 s1, s5, s2
-; TONGA-NEXT: s_sub_i32 s2, 0, s13
-; TONGA-NEXT: s_ashr_i32 s18, s0, 31
-; TONGA-NEXT: s_sub_i32 s0, 0, s14
-; TONGA-NEXT: s_ashr_i32 s19, s1, 31
-; TONGA-NEXT: s_sub_i32 s1, 0, s15
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_lo_u32 v4, s2, v0
-; TONGA-NEXT: v_mul_lo_u32 v5, s0, v1
-; TONGA-NEXT: v_mul_lo_u32 v6, s1, v2
-; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4
-; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
-; TONGA-NEXT: v_mul_hi_u32 v6, v2, v6
-; TONGA-NEXT: s_sub_i32 s20, 0, s17
-; TONGA-NEXT: v_readfirstlane_b32 s7, v7
-; TONGA-NEXT: s_abs_i32 s3, s3
-; TONGA-NEXT: s_abs_i32 s4, s4
-; TONGA-NEXT: s_abs_i32 s5, s5
-; TONGA-NEXT: v_mul_lo_u32 v7, s20, v3
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; TONGA-NEXT: v_mul_hi_u32 v0, s3, v0
-; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
-; TONGA-NEXT: v_mul_hi_u32 v2, s5, v2
-; TONGA-NEXT: v_mul_hi_u32 v7, v3, v7
-; TONGA-NEXT: v_mul_lo_u32 v4, v0, s13
-; TONGA-NEXT: v_mul_lo_u32 v6, v1, s14
-; TONGA-NEXT: v_mul_lo_u32 v8, v2, s15
-; TONGA-NEXT: s_abs_i32 s16, s7
-; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT: v_mul_hi_u32 v3, s16, v3
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s3, v4
-; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s4, v6
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s5, v8
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v0
-; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v1
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v2
-; TONGA-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
-; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
-; TONGA-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
-; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, s13, v4
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
-; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s14, v6
-; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s15, v8
-; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v0
-; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
-; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v1
-; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v2
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
-; TONGA-NEXT: v_mul_lo_u32 v4, v3, s17
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
-; TONGA-NEXT: s_ashr_i32 s12, s12, 31
-; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; TONGA-NEXT: v_xor_b32_e32 v0, s12, v0
-; TONGA-NEXT: v_xor_b32_e32 v1, s18, v1
-; TONGA-NEXT: v_xor_b32_e32 v2, s19, v2
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s16, v4
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s12, v0
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s18, v1
-; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s19, v2
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s17, v4
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT: s_xor_b32 s0, s7, s6
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; TONGA-NEXT: s_ashr_i32 s0, s0, 31
-; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; TONGA-NEXT: v_xor_b32_e32 v3, s0, v3
-; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
+; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3
+; TONGA-NEXT: v_max_i32_e32 v6, v3, v6
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4
+; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5
+; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3
+; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i32:
@@ -2006,7 +1994,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_lo_u32 v1, v3, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2014,7 +2002,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -2053,7 +2041,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1
-; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2061,7 +2049,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index bbdfc76..da454ee 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -852,19 +852,19 @@ define amdgpu_kernel void @select_v2f16(
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
-; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13
-; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
-; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
+; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[20:23], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[24:27], 0
@@ -874,20 +874,18 @@ define amdgpu_kernel void @select_v2f16(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v7.l, v1.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1058,21 +1056,18 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1236,21 +1231,18 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1402,8 +1394,6 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
-; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
@@ -1411,10 +1401,10 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
-; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1425,12 +1415,9 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_c:
@@ -1581,8 +1568,6 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
-; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
@@ -1590,10 +1575,10 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
-; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1604,12 +1589,9 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_d:
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 5944342..bbd1793 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -467,28 +467,28 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_readfirstlane_b32 s2, v2
; GCN-NEXT: s_abs_i32 s2, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GCN-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_sub_i32 s6, 0, s2
-; GCN-NEXT: s_ashr_i32 s5, s3, 31
+; GCN-NEXT: s_ashr_i32 s5, s4, 31
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: s_abs_i32 s3, s3
-; GCN-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-NEXT: s_abs_i32 s4, s4
+; GCN-NEXT: v_readfirstlane_b32 s3, v3
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_readfirstlane_b32 s7, v0
; GCN-NEXT: s_mul_i32 s6, s6, s7
; GCN-NEXT: s_mul_hi_u32 s6, s7, s6
; GCN-NEXT: s_add_i32 s7, s7, s6
-; GCN-NEXT: s_mul_hi_u32 s6, s3, s7
+; GCN-NEXT: s_mul_hi_u32 s6, s4, s7
; GCN-NEXT: s_mul_i32 s6, s6, s2
-; GCN-NEXT: s_sub_i32 s3, s3, s6
-; GCN-NEXT: s_sub_i32 s6, s3, s2
-; GCN-NEXT: s_cmp_ge_u32 s3, s2
-; GCN-NEXT: s_cselect_b32 s3, s6, s3
-; GCN-NEXT: s_sub_i32 s6, s3, s2
-; GCN-NEXT: s_cmp_ge_u32 s3, s2
-; GCN-NEXT: s_cselect_b32 s2, s6, s3
-; GCN-NEXT: s_abs_i32 s3, s4
+; GCN-NEXT: s_sub_i32 s4, s4, s6
+; GCN-NEXT: s_sub_i32 s6, s4, s2
+; GCN-NEXT: s_cmp_ge_u32 s4, s2
+; GCN-NEXT: s_cselect_b32 s4, s6, s4
+; GCN-NEXT: s_sub_i32 s6, s4, s2
+; GCN-NEXT: s_cmp_ge_u32 s4, s2
+; GCN-NEXT: s_cselect_b32 s2, s6, s4
+; GCN-NEXT: s_abs_i32 s3, s3
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_xor_b32 s2, s2, s5
; GCN-NEXT: s_sub_i32 s7, 0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 4a6202ea..6daea57 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -788,12 +788,10 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3
; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s3, 16
; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s2, 16
-; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v1.l, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.h, s1, s0
; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-SDAG-FAKE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index cd1c532..6a273e5 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -813,7 +813,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -825,11 +825,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index c9b94e0..99b6ab7 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -189,14 +189,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_sgpr:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff
-; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff
-; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SDAG-GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; SDAG-GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.l, s2, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.h, s3, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; SDAG-GFX11-TRUE16-NEXT: s_endpgm
;
; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_sgpr:
@@ -215,14 +212,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_sgpr:
; SDAG-GFX12-TRUE16: ; %bb.0:
; SDAG-GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff
-; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff
-; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SDAG-GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; SDAG-GFX12-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, s2, 0, 0xff
+; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.h, s3, 0, 0xff
+; SDAG-GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; SDAG-GFX12-TRUE16-NEXT: s_endpgm
;
; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
index 801324e..dfc59f6 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
@@ -1023,10 +1023,11 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16:
@@ -1052,10 +1053,11 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index 98919f5..4d5ade4 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -1024,10 +1024,11 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i16:
@@ -1053,10 +1054,11 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i16:
@@ -1298,11 +1300,12 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i16:
@@ -1331,11 +1334,12 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i16:
@@ -1468,12 +1472,13 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i16:
@@ -1509,12 +1514,13 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i16:
@@ -1706,12 +1712,13 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i16:
@@ -1762,12 +1769,13 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
index bdb1c22..9e033f5 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
@@ -1046,10 +1046,11 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16:
@@ -1075,10 +1076,11 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
index cf344ea..166e6c4 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
@@ -992,10 +992,11 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16:
@@ -1021,10 +1022,11 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index 07e9325..5045540 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -455,10 +455,7 @@ define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v2i16_rebroadcast:
@@ -499,10 +496,8 @@ define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -550,10 +545,8 @@ define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@@ -613,10 +606,8 @@ define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@@ -700,10 +691,8 @@ define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index b01e92d..6bf6d54 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1288,9 +1288,8 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2571,10 +2570,9 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: i16_hi16low16bits:
@@ -2626,14 +2624,10 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: i16_hi16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
-; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: i16_hi16bits:
diff --git a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
index 153ca10..72f10ae 100644
--- a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
@@ -1141,29 +1141,88 @@ define <2 x i32> @test_select_cc(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
ret <2 x i32> %r
}
-define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
-; CHECK-NOI32X2-LABEL: test_trunc_2xi32(
+define <2 x i16> @test_trunc_2xi32_to_2xi16(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi16(
; CHECK-NOI32X2: {
; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
; CHECK-NOI32X2-EMPTY:
; CHECK-NOI32X2-NEXT: // %bb.0:
-; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_param_0];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi16_param_0];
; CHECK-NOI32X2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NOI32X2-NEXT: ret;
;
-; CHECK-I32X2-LABEL: test_trunc_2xi32(
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi16(
; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-I32X2-EMPTY:
; CHECK-I32X2-NEXT: // %bb.0:
-; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_param_0];
-; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %rd1;
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi16_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-I32X2-NEXT: ret;
%r = trunc <2 x i32> %a to <2 x i16>
ret <2 x i16> %r
}
+define <2 x i8> @test_trunc_2xi32_to_2xi8(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi8(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi8_param_0];
+; CHECK-NOI32X2-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NOI32X2-NEXT: cvt.u16.u32 %rs2, %r1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b8 [func_retval0], {%rs2, %rs1};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi8(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi8_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-I32X2-NEXT: cvt.u16.u32 %rs2, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b8 [func_retval0], {%rs2, %rs1};
+; CHECK-I32X2-NEXT: ret;
+ %r = trunc <2 x i32> %a to <2 x i8>
+ ret <2 x i8> %r
+}
+
+define <2 x i1> @test_trunc_2xi32_to_2xi1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi1(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi1_param_0];
+; CHECK-NOI32X2-NEXT: st.param.b8 [func_retval0], %r1;
+; CHECK-NOI32X2-NEXT: st.param.b8 [func_retval0+1], %r2;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi1(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi1_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: st.param.b8 [func_retval0], %r1;
+; CHECK-I32X2-NEXT: st.param.b8 [func_retval0+1], %r2;
+; CHECK-I32X2-NEXT: ret;
+ %r = trunc <2 x i32> %a to <2 x i1>
+ ret <2 x i1> %r
+}
+
define <2 x i32> @test_trunc_2xi64(<2 x i64> %a) #0 {
; CHECK-LABEL: test_trunc_2xi64(
; CHECK: {
@@ -1180,14 +1239,14 @@ define <2 x i32> @test_trunc_2xi64(<2 x i64> %a) #0 {
ret <2 x i32> %r
}
-define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
-; CHECK-LABEL: test_zext_2xi32(
+define <2 x i32> @test_zext_2xi16_to_2xi32(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_zext_2xi16_to_2xi32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi16_to_2xi32_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
@@ -1197,6 +1256,47 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
ret <2 x i32> %r
}
+define <2 x i32> @test_zext_2xi8_to_2xi32(<2 x i8> %a) #0 {
+; CHECK-LABEL: test_zext_2xi8_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_zext_2xi8_to_2xi32_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <2 x i8> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_zext_2xi1_to_2xi32(<2 x i1> %a) #0 {
+; CHECK-LABEL: test_zext_2xi1_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [test_zext_2xi1_to_2xi32_param_0+1];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.ne.b16 %p2, %rs2, 0;
+; CHECK-NEXT: ld.param.b8 %rs3, [test_zext_2xi1_to_2xi32_param_0];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: setp.ne.b16 %p1, %rs4, 0;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: and.b32 %r2, %r1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs3;
+; CHECK-NEXT: and.b32 %r4, %r3, 1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <2 x i1> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
define <2 x i64> @test_zext_2xi64(<2 x i32> %a) #0 {
; CHECK-NOI32X2-LABEL: test_zext_2xi64(
; CHECK-NOI32X2: {
@@ -1566,6 +1666,55 @@ entry:
ret void
}
+define <2 x i32> @test_sext_v2i8_to_v2i32 (<2 x i8> %a) {
+; CHECK-LABEL: test_sext_v2i8_to_v2i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_sext_v2i8_to_v2i32_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs1;
+; CHECK-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r3};
+; CHECK-NEXT: ret;
+ %r = sext <2 x i8> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_sext_v2i16_to_v2i32 (<2 x i16> %a) {
+; CHECK-NOI32X2-LABEL: test_sext_v2i16_to_v2i32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b16 %rs<2>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.b32 %r1, [test_sext_v2i16_to_v2i32_param_0];
+; CHECK-NOI32X2-NEXT: cvt.s32.s16 %r2, %r1;
+; CHECK-NOI32X2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
+; CHECK-NOI32X2-NEXT: cvt.s32.s16 %r3, %rs1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_sext_v2i16_to_v2i32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b16 %rs<2>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b32 %r1, [test_sext_v2i16_to_v2i32_param_0];
+; CHECK-I32X2-NEXT: cvt.s32.s16 %r2, %r1;
+; CHECK-I32X2-NEXT: mov.b32 {_, %rs1}, %r1;
+; CHECK-I32X2-NEXT: cvt.s32.s16 %r3, %rs1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-I32X2-NEXT: ret;
+ %r = sext <2 x i16> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
define <2 x float> @test_uitofp_v2i32(<2 x i32> %a) {
; CHECK-NOI32X2-LABEL: test_uitofp_v2i32(
; CHECK-NOI32X2: {
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index f3529b1..22c2d81 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -80,6 +80,7 @@
; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s
; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s
; RUN: llc -mtriple=riscv32 -mattr=+zalrsc %s -o - | FileCheck --check-prefix=RV32ZALRSC %s
+; RUN: llc -mtriple=riscv32 -mattr=+zaamo,+zalrsc %s -o - | FileCheck --check-prefixes=CHECK,RV32COMBINEINTOA %s
; RUN: llc -mtriple=riscv32 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCA %s
; RUN: llc -mtriple=riscv32 -mattr=+zcb %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCB %s
; RUN: llc -mtriple=riscv32 -mattr=+zcd %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCD %s
@@ -227,6 +228,7 @@
; RUN: llc -mtriple=riscv64 -mattr=+ztso %s -o - | FileCheck --check-prefixes=CHECK,RV64ZTSO %s
; RUN: llc -mtriple=riscv64 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV64ZAAMO %s
; RUN: llc -mtriple=riscv64 -mattr=+zalrsc %s -o - | FileCheck --check-prefix=RV64ZALRSC %s
+; RUN: llc -mtriple=riscv64 -mattr=+zaamo,+zalrsc %s -o - | FileCheck --check-prefixes=CHECK,RV64COMBINEINTOA %s
; RUN: llc -mtriple=riscv64 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCA %s
; RUN: llc -mtriple=riscv64 -mattr=+zcb %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCB %s
; RUN: llc -mtriple=riscv64 -mattr=+zcd %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCD %s
@@ -392,6 +394,7 @@
; RV32XWCHC: .attribute 5, "rv32i2p1_zca1p0_xwchc2p2"
; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0"
; RV32ZALRSC: .attribute 5, "rv32i2p1_zalrsc1p0"
+; RV32COMBINEINTOA: .attribute 5, "rv32i2p1_a2p1_zaamo1p0_zalrsc1p0"
; RV32ZCA: .attribute 5, "rv32i2p1_zca1p0"
; RV32ZCB: .attribute 5, "rv32i2p1_zca1p0_zcb1p0"
; RV32ZCD: .attribute 5, "rv32i2p1_f2p2_d2p2_zicsr2p0_zca1p0_zcd1p0"
@@ -537,6 +540,7 @@
; RV64ZTSO: .attribute 5, "rv64i2p1_ztso1p0"
; RV64ZAAMO: .attribute 5, "rv64i2p1_zaamo1p0"
; RV64ZALRSC: .attribute 5, "rv64i2p1_zalrsc1p0"
+; RV64COMBINEINTOA: .attribute 5, "rv64i2p1_a2p1_zaamo1p0_zalrsc1p0"
; RV64ZCA: .attribute 5, "rv64i2p1_zca1p0"
; RV64ZCB: .attribute 5, "rv64i2p1_zca1p0_zcb1p0"
; RV64ZCD: .attribute 5, "rv64i2p1_f2p2_d2p2_zicsr2p0_zca1p0_zcd1p0"
diff --git a/llvm/test/CodeGen/RISCV/div_minsize.ll b/llvm/test/CodeGen/RISCV/div_minsize.ll
index 601821b..794af2f 100644
--- a/llvm/test/CodeGen/RISCV/div_minsize.ll
+++ b/llvm/test/CodeGen/RISCV/div_minsize.ll
@@ -68,3 +68,151 @@ define i32 @testsize4(i32 %x) minsize nounwind {
%div = udiv i32 %x, 33
ret i32 %div
}
+
+define i128 @i128_sdiv(i128 %arg0) minsize nounwind {
+; RV32IM-LABEL: i128_sdiv:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lw a2, 12(a1)
+; RV32IM-NEXT: lw a3, 8(a1)
+; RV32IM-NEXT: lw a4, 0(a1)
+; RV32IM-NEXT: lw a1, 4(a1)
+; RV32IM-NEXT: srai a5, a2, 31
+; RV32IM-NEXT: srli a5, a5, 30
+; RV32IM-NEXT: add a5, a4, a5
+; RV32IM-NEXT: sltu a4, a5, a4
+; RV32IM-NEXT: srli a5, a5, 2
+; RV32IM-NEXT: add a6, a1, a4
+; RV32IM-NEXT: sltu a1, a6, a1
+; RV32IM-NEXT: and a1, a4, a1
+; RV32IM-NEXT: srli a4, a6, 2
+; RV32IM-NEXT: slli a6, a6, 30
+; RV32IM-NEXT: or a5, a5, a6
+; RV32IM-NEXT: add a1, a3, a1
+; RV32IM-NEXT: srli a6, a1, 2
+; RV32IM-NEXT: sltu a3, a1, a3
+; RV32IM-NEXT: slli a1, a1, 30
+; RV32IM-NEXT: add a2, a2, a3
+; RV32IM-NEXT: or a1, a4, a1
+; RV32IM-NEXT: slli a3, a2, 30
+; RV32IM-NEXT: srai a2, a2, 2
+; RV32IM-NEXT: or a3, a6, a3
+; RV32IM-NEXT: sw a5, 0(a0)
+; RV32IM-NEXT: sw a1, 4(a0)
+; RV32IM-NEXT: sw a3, 8(a0)
+; RV32IM-NEXT: sw a2, 12(a0)
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: i128_sdiv:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: addi sp, sp, -16
+; RV64IM-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: li a2, 4
+; RV64IM-NEXT: li a3, 0
+; RV64IM-NEXT: call __divti3
+; RV64IM-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: addi sp, sp, 16
+; RV64IM-NEXT: ret
+ %div = sdiv i128 %arg0, 4
+ ret i128 %div
+}
+
+define i256 @i256_sdiv(i256 %arg0) minsize nounwind {
+; RV32IM-LABEL: i256_sdiv:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lw a5, 16(a1)
+; RV32IM-NEXT: lw a4, 20(a1)
+; RV32IM-NEXT: lw a2, 24(a1)
+; RV32IM-NEXT: lw a3, 28(a1)
+; RV32IM-NEXT: lw a6, 0(a1)
+; RV32IM-NEXT: lw a7, 4(a1)
+; RV32IM-NEXT: lw t0, 8(a1)
+; RV32IM-NEXT: lw t1, 12(a1)
+; RV32IM-NEXT: srai a1, a3, 31
+; RV32IM-NEXT: srli a1, a1, 30
+; RV32IM-NEXT: add a1, a6, a1
+; RV32IM-NEXT: sltu t2, a1, a6
+; RV32IM-NEXT: add a6, a7, t2
+; RV32IM-NEXT: sltu a7, a6, a7
+; RV32IM-NEXT: and t2, t2, a7
+; RV32IM-NEXT: add a7, t0, t2
+; RV32IM-NEXT: sltu t3, a7, t0
+; RV32IM-NEXT: add t0, t1, t3
+; RV32IM-NEXT: beqz t2, .LBB5_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: sltu t1, t0, t1
+; RV32IM-NEXT: and t2, t3, t1
+; RV32IM-NEXT: .LBB5_2:
+; RV32IM-NEXT: add t2, a5, t2
+; RV32IM-NEXT: srli t1, t0, 2
+; RV32IM-NEXT: srli t3, a7, 2
+; RV32IM-NEXT: slli t0, t0, 30
+; RV32IM-NEXT: slli a7, a7, 30
+; RV32IM-NEXT: or t0, t3, t0
+; RV32IM-NEXT: srli t3, a6, 2
+; RV32IM-NEXT: srli a1, a1, 2
+; RV32IM-NEXT: slli a6, a6, 30
+; RV32IM-NEXT: sltu a5, t2, a5
+; RV32IM-NEXT: or a7, t3, a7
+; RV32IM-NEXT: srli t3, t2, 2
+; RV32IM-NEXT: slli t2, t2, 30
+; RV32IM-NEXT: or a1, a1, a6
+; RV32IM-NEXT: add a6, a4, a5
+; RV32IM-NEXT: or t1, t1, t2
+; RV32IM-NEXT: sltu a4, a6, a4
+; RV32IM-NEXT: srli t2, a6, 2
+; RV32IM-NEXT: slli a6, a6, 30
+; RV32IM-NEXT: sw a1, 0(a0)
+; RV32IM-NEXT: sw a7, 4(a0)
+; RV32IM-NEXT: sw t0, 8(a0)
+; RV32IM-NEXT: sw t1, 12(a0)
+; RV32IM-NEXT: and a4, a5, a4
+; RV32IM-NEXT: or a1, t3, a6
+; RV32IM-NEXT: add a4, a2, a4
+; RV32IM-NEXT: srli a5, a4, 2
+; RV32IM-NEXT: sltu a2, a4, a2
+; RV32IM-NEXT: slli a4, a4, 30
+; RV32IM-NEXT: add a2, a3, a2
+; RV32IM-NEXT: or a3, t2, a4
+; RV32IM-NEXT: slli a4, a2, 30
+; RV32IM-NEXT: srai a2, a2, 2
+; RV32IM-NEXT: or a4, a5, a4
+; RV32IM-NEXT: sw a1, 16(a0)
+; RV32IM-NEXT: sw a3, 20(a0)
+; RV32IM-NEXT: sw a4, 24(a0)
+; RV32IM-NEXT: sw a2, 28(a0)
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: i256_sdiv:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: ld a2, 24(a1)
+; RV64IM-NEXT: ld a3, 16(a1)
+; RV64IM-NEXT: ld a4, 0(a1)
+; RV64IM-NEXT: ld a1, 8(a1)
+; RV64IM-NEXT: srai a5, a2, 63
+; RV64IM-NEXT: srli a5, a5, 62
+; RV64IM-NEXT: add a5, a4, a5
+; RV64IM-NEXT: sltu a4, a5, a4
+; RV64IM-NEXT: srli a5, a5, 2
+; RV64IM-NEXT: add a6, a1, a4
+; RV64IM-NEXT: sltu a1, a6, a1
+; RV64IM-NEXT: and a1, a4, a1
+; RV64IM-NEXT: srli a4, a6, 2
+; RV64IM-NEXT: slli a6, a6, 62
+; RV64IM-NEXT: or a5, a5, a6
+; RV64IM-NEXT: add a1, a3, a1
+; RV64IM-NEXT: srli a6, a1, 2
+; RV64IM-NEXT: sltu a3, a1, a3
+; RV64IM-NEXT: slli a1, a1, 62
+; RV64IM-NEXT: add a2, a2, a3
+; RV64IM-NEXT: or a1, a4, a1
+; RV64IM-NEXT: slli a3, a2, 62
+; RV64IM-NEXT: srai a2, a2, 2
+; RV64IM-NEXT: or a3, a6, a3
+; RV64IM-NEXT: sd a5, 0(a0)
+; RV64IM-NEXT: sd a1, 8(a0)
+; RV64IM-NEXT: sd a3, 16(a0)
+; RV64IM-NEXT: sd a2, 24(a0)
+; RV64IM-NEXT: ret
+ %div = sdiv i256 %arg0, 4
+ ret i256 %div
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/mixed-float-bf16-arith.ll b/llvm/test/CodeGen/RISCV/rvv/mixed-float-bf16-arith.ll
new file mode 100644
index 0000000..489323b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/mixed-float-bf16-arith.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.nxv1f16(
+ <vscale x 1 x half>,
+ <vscale x 1 x half>,
+ <vscale x 1 x half>,
+ iXLen, iXLen);
+
+declare <vscale x 1 x i32> @llvm.riscv.vadd.nxv1i32.nxv1i32(
+ <vscale x 1 x i32>,
+ <vscale x 1 x i32>,
+ <vscale x 1 x i32>,
+ iXLen);
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @test_half_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2, <vscale x 1 x half> %3, <vscale x 1 x half> %4, ptr %ptr) nounwind {
+; CHECK-LABEL: test_half_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a2, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vse16.v v10, (a1)
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %3,
+ <vscale x 1 x half> %4,
+ iXLen 0, iXLen %2)
+
+ %b = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ call void @llvm.riscv.vse(<vscale x 1 x half> %a, ptr %ptr, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %b
+}
+
+define <vscale x 1 x bfloat> @test_i32_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2, <vscale x 1 x i32> %3, <vscale x 1 x i32> %4, ptr %ptr) nounwind {
+; CHECK-LABEL: test_i32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v10, v10, v11
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i32> @llvm.riscv.vadd.nxv1i32.nxv1i32(
+ <vscale x 1 x i32> poison,
+ <vscale x 1 x i32> %3,
+ <vscale x 1 x i32> %4,
+ iXLen %2)
+
+ %b = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ call void @llvm.riscv.vse(<vscale x 1 x i32> %a, ptr %ptr, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %b
+}
+
+define <vscale x 1 x bfloat> @test_half_bf16_half(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2, <vscale x 1 x half> %3, <vscale x 1 x half> %4, ptr %ptr) nounwind {
+; CHECK-LABEL: test_half_bf16_half:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a2, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v11
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vse16.v v9, (a1)
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %3,
+ <vscale x 1 x half> %4,
+ iXLen 0, iXLen %2)
+
+ %b = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ %c = call <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %a,
+ <vscale x 1 x half> %4,
+ iXLen 0, iXLen %2)
+
+ store <vscale x 1 x half> %c, ptr %ptr
+
+ ret <vscale x 1 x bfloat> %b
+}
+
+define <vscale x 1 x bfloat> @test_bf16_half_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2, <vscale x 1 x half> %3, <vscale x 1 x half> %4, ptr %ptr) nounwind {
+; CHECK-LABEL: test_bf16_half_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a2, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; CHECK-NEXT: vse16.v v10, (a1)
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ %b = call <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.nxv1f16(
+ <vscale x 1 x half> poison,
+ <vscale x 1 x half> %3,
+ <vscale x 1 x half> %4,
+ iXLen 0, iXLen %2)
+
+ %c = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %a,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ store <vscale x 1 x half> %b, ptr %ptr
+
+ ret <vscale x 1 x bfloat> %c
+}
+
+define <vscale x 1 x bfloat> @test_bf16_i16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2, <vscale x 1 x i16> %3, <vscale x 1 x i16> %4, ptr %ptr) nounwind {
+; CHECK-LABEL: test_bf16_i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a2, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: vadd.vv v9, v10, v11
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; CHECK-NEXT: vse16.v v9, (a1)
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ %b = call <vscale x 1 x i16> @llvm.riscv.vadd.nxv1i16.nxv1i16(
+ <vscale x 1 x i16> poison,
+ <vscale x 1 x i16> %3,
+ <vscale x 1 x i16> %4,
+ iXLen %2)
+
+ store <vscale x 1 x i16> %b, ptr %ptr
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-bf.ll
new file mode 100644
index 0000000..db1b081
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-bf.ll
@@ -0,0 +1,607 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfadd_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfadd_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfadd_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfadd_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfadd.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfadd_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfadd.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfadd_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfadd.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfadd_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfadd.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfadd_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfadd.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfadd.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfadd_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfadd.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfadd_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfadd.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfadd.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfadd_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v8, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfadd.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfadd_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfadd_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfadd.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfadd.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfadd_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfadd.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfadd_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfadd.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfadd.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfadd_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfadd.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfadd.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfadd_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfadd.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfadd.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfadd.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfadd_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfadd.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfadd.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfadd.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfadd_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfadd.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfadd.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfadd.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfadd_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfadd.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfadd.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfadd.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfadd_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfadd.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfadd.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfadd.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfadd_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfadd.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfadd.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfadd.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfadd_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfadd.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfadd.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfadd.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfadd_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfadd.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfadd.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfclass-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfclass-bf.ll
new file mode 100644
index 0000000..d7d49b3
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfclass-bf.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i16> @llvm.riscv.vfclass.nxv1i16.nxv1bf16(
+ <vscale x 1 x i16>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vfclass_v_nxv1i16_nxv1bf16(
+; CHECK-LABEL: intrinsic_vfclass_v_nxv1i16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfclass.v v8, v8
+; CHECK-NEXT: ret
+ <vscale x 1 x bfloat> %0,
+ iXLen %1) nounwind {
+entry:
+ %a = call <vscale x 1 x i16> @llvm.riscv.vfclass.nxv1i16.nxv1bf16(
+ <vscale x 1 x i16> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vfclass.mask.nxv1i16.nxv1bf16(
+ <vscale x 1 x i16>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vfclass_mask_v_nxv1i16_nxv1bf16(
+; CHECK-LABEL: intrinsic_vfclass_mask_v_nxv1i16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfclass.v v8, v9, v0.t
+; CHECK-NEXT: ret
+ <vscale x 1 x i16> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3) nounwind {
+entry:
+ %a = call <vscale x 1 x i16> @llvm.riscv.vfclass.mask.nxv1i16.nxv1bf16(
+ <vscale x 1 x i16> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vfclass.nxv2i16.nxv2bf16(
+ <vscale x 2 x i16>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vfclass_v_nxv2i16_nxv2bf16(
+; CHECK-LABEL: intrinsic_vfclass_v_nxv2i16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfclass.v v8, v8
+; CHECK-NEXT: ret
+ <vscale x 2 x bfloat> %0,
+ iXLen %1) nounwind {
+entry:
+ %a = call <vscale x 2 x i16> @llvm.riscv.vfclass.nxv2i16.nxv2bf16(
+ <vscale x 2 x i16> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vfclass.mask.nxv2i16.nxv2bf16(
+ <vscale x 2 x i16>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vfclass_mask_v_nxv2i16_nxv2bf16(
+; CHECK-LABEL: intrinsic_vfclass_mask_v_nxv2i16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfclass.v v8, v9, v0.t
+; CHECK-NEXT: ret
+ <vscale x 2 x i16> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3) nounwind {
+entry:
+ %a = call <vscale x 2 x i16> @llvm.riscv.vfclass.mask.nxv2i16.nxv2bf16(
+ <vscale x 2 x i16> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vfclass.nxv4i16.nxv4bf16(
+ <vscale x 4 x i16>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vfclass_v_nxv4i16_nxv4bf16(
+; CHECK-LABEL: intrinsic_vfclass_v_nxv4i16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfclass.v v8, v8
+; CHECK-NEXT: ret
+ <vscale x 4 x bfloat> %0,
+ iXLen %1) nounwind {
+entry:
+ %a = call <vscale x 4 x i16> @llvm.riscv.vfclass.nxv4i16.nxv4bf16(
+ <vscale x 4 x i16> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vfclass.mask.nxv4i16.nxv4bf16(
+ <vscale x 4 x i16>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vfclass_mask_v_nxv4i16_nxv4bf16(
+; CHECK-LABEL: intrinsic_vfclass_mask_v_nxv4i16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfclass.v v8, v9, v0.t
+; CHECK-NEXT: ret
+ <vscale x 4 x i16> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3) nounwind {
+entry:
+ %a = call <vscale x 4 x i16> @llvm.riscv.vfclass.mask.nxv4i16.nxv4bf16(
+ <vscale x 4 x i16> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vfclass.nxv8i16.nxv8bf16(
+ <vscale x 8 x i16>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vfclass_v_nxv8i16_nxv8bf16(
+; CHECK-LABEL: intrinsic_vfclass_v_nxv8i16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfclass.v v8, v8
+; CHECK-NEXT: ret
+ <vscale x 8 x bfloat> %0,
+ iXLen %1) nounwind {
+entry:
+ %a = call <vscale x 8 x i16> @llvm.riscv.vfclass.nxv8i16.nxv8bf16(
+ <vscale x 8 x i16> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vfclass.mask.nxv8i16.nxv8bf16(
+ <vscale x 8 x i16>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vfclass_mask_v_nxv8i16_nxv8bf16(
+; CHECK-LABEL: intrinsic_vfclass_mask_v_nxv8i16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfclass.v v8, v10, v0.t
+; CHECK-NEXT: ret
+ <vscale x 8 x i16> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3) nounwind {
+entry:
+ %a = call <vscale x 8 x i16> @llvm.riscv.vfclass.mask.nxv8i16.nxv8bf16(
+ <vscale x 8 x i16> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vfclass.nxv16i16.nxv16bf16(
+ <vscale x 16 x i16>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vfclass_v_nxv16i16_nxv16bf16(
+; CHECK-LABEL: intrinsic_vfclass_v_nxv16i16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfclass.v v8, v8
+; CHECK-NEXT: ret
+ <vscale x 16 x bfloat> %0,
+ iXLen %1) nounwind {
+entry:
+ %a = call <vscale x 16 x i16> @llvm.riscv.vfclass.nxv16i16.nxv16bf16(
+ <vscale x 16 x i16> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vfclass.mask.nxv16i16.nxv16bf16(
+ <vscale x 16 x i16>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vfclass_mask_v_nxv16i16_nxv16bf16(
+; CHECK-LABEL: intrinsic_vfclass_mask_v_nxv16i16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfclass.v v8, v12, v0.t
+; CHECK-NEXT: ret
+ <vscale x 16 x i16> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3) nounwind {
+entry:
+ %a = call <vscale x 16 x i16> @llvm.riscv.vfclass.mask.nxv16i16.nxv16bf16(
+ <vscale x 16 x i16> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 32 x i16> @llvm.riscv.vfclass.nxv32i16.nxv32bf16(
+ <vscale x 32 x i16>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x i16> @intrinsic_vfclass_v_nxv32i16_nxv32bf16(
+; CHECK-LABEL: intrinsic_vfclass_v_nxv32i16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfclass.v v8, v8
+; CHECK-NEXT: ret
+ <vscale x 32 x bfloat> %0,
+ iXLen %1) nounwind {
+entry:
+ %a = call <vscale x 32 x i16> @llvm.riscv.vfclass.nxv32i16.nxv32bf16(
+ <vscale x 32 x i16> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x i16> %a
+}
+
+declare <vscale x 32 x i16> @llvm.riscv.vfclass.mask.nxv32i16.nxv32bf16(
+ <vscale x 32 x i16>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 32 x i16> @intrinsic_vfclass_mask_v_nxv32i16_nxv32bf16(
+; CHECK-LABEL: intrinsic_vfclass_mask_v_nxv32i16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, tu, mu
+; CHECK-NEXT: vfclass.v v8, v16, v0.t
+; CHECK-NEXT: ret
+ <vscale x 32 x i16> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %2,
+ iXLen %3) nounwind {
+entry:
+ %a = call <vscale x 32 x i16> @llvm.riscv.vfclass.mask.nxv32i16.nxv32bf16(
+ <vscale x 32 x i16> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %2,
+ iXLen %3, iXLen 0)
+
+ ret <vscale x 32 x i16> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmacc-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmacc-bf.ll
new file mode 100644
index 0000000..13821d7
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmacc-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmacc_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmacc.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmacc_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmacc.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmacc_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmacc.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmacc_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmacc.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmacc_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmacc.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmacc_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmacc.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmacc.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmacc_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmacc.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmacc.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmacc_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmacc.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmacc.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmacc_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmacc.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmacc_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmacc.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmacc_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmacc.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmacc_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmacc.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmacc.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmacc_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmacc.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmacc.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmacc_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmacc.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmacc.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmacc_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmacc.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmacc.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmacc_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmacc.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmacc.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmacc.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmacc_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmacc.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmacc.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmacc.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmacc_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmacc.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmacc.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmacc.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmacc_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmacc.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmacc.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmacc_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmacc_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmadd.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-bf.ll
new file mode 100644
index 0000000..09fc199
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmadd.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmadd_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmadd.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmadd_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmadd.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmadd.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmadd_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmadd.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmadd_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmadd.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmadd.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmadd_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmadd.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmadd_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmadd.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmadd.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmadd_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmadd.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmadd.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmadd_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmadd.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmadd.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmadd_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmadd.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmadd_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmadd.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmadd.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmadd_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmadd.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmadd_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmadd.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmadd.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmadd.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmadd_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmadd.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmadd.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmadd_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmadd.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmadd.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmadd.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmadd_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmadd.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmadd.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmadd_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmadd.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmadd.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmadd.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmadd_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmadd.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmadd.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmadd.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmadd_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmadd.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmadd.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmadd.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmadd_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmadd.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmadd.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmacc.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmadd_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmadd_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmacc.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-bf.ll
new file mode 100644
index 0000000..a337d30
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-bf.ll
@@ -0,0 +1,571 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmax_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmax_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfmax.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmax.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmax_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmax.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmax_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfmax.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmax.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmax_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmax.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmax_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfmax.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmax.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmax_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmax.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmax_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfmax.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmax.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmax_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmax.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmax_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfmax.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmax.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmax_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmax_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfmax.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmax_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmax.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmax.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmax_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfmax.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmax.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmax_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmax.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmax.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmax.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmax_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfmax.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmax.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmax.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmax_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmax.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmax.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmax.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmax_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfmax.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmax.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmax.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmax_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmax.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmax.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmax.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmax_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfmax.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmax.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmax.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmax_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmax.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmax.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmax.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmax_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfmax.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmax.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmax.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmax_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmax.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmax_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmax_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfmax.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmerge-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmerge-bf.ll
new file mode 100644
index 0000000..86ba7c7
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmerge-bf.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmerge.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmerge_vfm_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vfm_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmerge.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmerge.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmerge_vfm_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vfm_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmerge.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmerge.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmerge_vfm_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vfm_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmerge.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmerge.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmerge_vfm_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vfm_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmerge.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmerge.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmerge_vfm_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vfm_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmerge.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmerge.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmerge_vfm_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vfm_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmerge.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ <vscale x 32 x i1> %2,
+ iXLen %3)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmerge_vzm_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x i1> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vzm_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fmv.h.x fa5, zero
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmerge.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat zeroinitializer,
+ <vscale x 1 x i1> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_vfmerge_vzm_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x i1> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vzm_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fmv.h.x fa5, zero
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmerge.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat zeroinitializer,
+ <vscale x 2 x i1> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_vfmerge_vzm_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x i1> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vzm_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fmv.h.x fa5, zero
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmerge.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat zeroinitializer,
+ <vscale x 4 x i1> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_vfmerge_vzm_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x i1> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vzm_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fmv.h.x fa5, zero
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmerge.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat zeroinitializer,
+ <vscale x 8 x i1> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_vfmerge_vzm_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x i1> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vzm_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fmv.h.x fa5, zero
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmerge.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat zeroinitializer,
+ <vscale x 16 x i1> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_vfmerge_vzm_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x i1> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmerge_vzm_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fmv.h.x fa5, zero
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmerge.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat zeroinitializer,
+ <vscale x 32 x i1> %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-bf.ll
new file mode 100644
index 0000000..37c0cf5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-bf.ll
@@ -0,0 +1,571 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmin_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmin_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfmin.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmin.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmin_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmin.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmin_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfmin.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmin.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmin_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmin.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmin_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfmin.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmin.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmin_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmin.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmin_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfmin.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmin.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmin_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmin.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmin_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfmin.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmin.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmin_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmin.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmin_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfmin.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmin_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmin.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmin.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmin_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfmin.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmin.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmin_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmin.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmin.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmin.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmin_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfmin.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmin.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmin.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmin_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmin.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmin.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmin.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmin_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfmin.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmin.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmin.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmin_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmin.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmin.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmin.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmin_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfmin.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmin.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmin.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmin_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmin.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmin.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmin.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmin_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfmin.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmin.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmin.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmin_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmin.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmin.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmin.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmin_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmin_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfmin.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmin.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsac-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsac-bf.ll
new file mode 100644
index 0000000..948d219
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmsac-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsac_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsac.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsac_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsac.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsac_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsac.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsac_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsac.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsac_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsac.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsac_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmsac.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsac.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsac_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmsac.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsac.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsac_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmsac.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsac.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsac_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmsac.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsac_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsac.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsac_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsac.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsac_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsac.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsac.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsac_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsac.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsac.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsac_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsac.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsac.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsac_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsac.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsac.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsac_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmsac.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsac.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsac.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsac_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmsac.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsac.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsac.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsac_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmsac.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsac.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsac.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsac_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmsac.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsac.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmsub.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmsub.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsac_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsac_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmsub.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-bf.ll
new file mode 100644
index 0000000..6838f37
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmsub.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsub_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsub.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsub_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmsub.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsub.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsub_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsub.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsub_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmsub.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsub.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsub_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsub.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsub_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmsub.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsub.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsub_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmsub.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsub.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsub_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmsub.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsub.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsub_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmsub.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsub_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfmsub.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsub_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfmsub.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsub_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfmsub.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmsub_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfmsub.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsub_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmsub.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmsub_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfmsub.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsub_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfmsub.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmsub_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfmsub.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsub_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfmsub.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmsub_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfmsub.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmsub.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmsac.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmsub_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfmsub_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmsac.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-bf.ll
new file mode 100644
index 0000000..44bce72
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-bf.ll
@@ -0,0 +1,607 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmul.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmul_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmul.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmul_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfmul.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmul.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmul_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmul.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmul_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfmul.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmul.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmul_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmul.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmul_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfmul.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmul.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmul_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v8, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmul.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmul_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfmul.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmul.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmul_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v8, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmul.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmul_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfmul.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmul.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmul_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v8, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmul.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmul_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfmul.vv v8, v16, v24, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmul.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmul_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmul.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmul.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmul.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmul_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfmul.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmul.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmul.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmul_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmul.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmul.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmul.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmul_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfmul.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmul.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmul.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmul_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmul.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmul.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmul.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmul_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfmul.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmul.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmul.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmul_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmul.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmul.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmul.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmul_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfmul.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmul.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmul.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmul_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmul.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmul.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmul.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmul_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfmul.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmul.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmul.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmul_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmul.vf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmul.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmul.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmul_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfmul.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmul.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll
new file mode 100644
index 0000000..fbc73119
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+experimental-zvfbfa -target-abi lp64d -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+experimental-zvfbfa -target-abi ilp32d -verify-machineinstrs < %s | FileCheck %s
+
+declare bfloat @llvm.riscv.vfmv.f.s.nxv1bf16(<vscale x 1 x bfloat>)
+
+define bfloat @intrinsic_vfmv.f.s_s_nxv1bf16(<vscale x 1 x bfloat> %0) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+entry:
+ %a = call bfloat @llvm.riscv.vfmv.f.s.nxv1bf16(<vscale x 1 x bfloat> %0)
+ ret bfloat %a
+}
+
+declare bfloat @llvm.riscv.vfmv.f.s.nxv2bf16(<vscale x 2 x bfloat>)
+
+define bfloat @intrinsic_vfmv.f.s_s_nxv2bf16(<vscale x 2 x bfloat> %0) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+entry:
+ %a = call bfloat @llvm.riscv.vfmv.f.s.nxv2bf16(<vscale x 2 x bfloat> %0)
+ ret bfloat %a
+}
+
+declare bfloat @llvm.riscv.vfmv.f.s.nxv4bf16(<vscale x 4 x bfloat>)
+
+define bfloat @intrinsic_vfmv.f.s_s_nxv4bf16(<vscale x 4 x bfloat> %0) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+entry:
+ %a = call bfloat @llvm.riscv.vfmv.f.s.nxv4bf16(<vscale x 4 x bfloat> %0)
+ ret bfloat %a
+}
+
+declare bfloat @llvm.riscv.vfmv.f.s.nxv8bf16(<vscale x 8 x bfloat>)
+
+define bfloat @intrinsic_vfmv.f.s_s_nxv8bf16(<vscale x 8 x bfloat> %0) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+entry:
+ %a = call bfloat @llvm.riscv.vfmv.f.s.nxv8bf16(<vscale x 8 x bfloat> %0)
+ ret bfloat %a
+}
+
+declare bfloat @llvm.riscv.vfmv.f.s.nxv16bf16(<vscale x 16 x bfloat>)
+
+define bfloat @intrinsic_vfmv.f.s_s_nxv16bf16(<vscale x 16 x bfloat> %0) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+entry:
+ %a = call bfloat @llvm.riscv.vfmv.f.s.nxv16bf16(<vscale x 16 x bfloat> %0)
+ ret bfloat %a
+}
+
+declare bfloat @llvm.riscv.vfmv.f.s.nxv32bf16(<vscale x 32 x bfloat>)
+
+define bfloat @intrinsic_vfmv.f.s_s_nxv32bf16(<vscale x 32 x bfloat> %0) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+entry:
+ %a = call bfloat @llvm.riscv.vfmv.f.s.nxv32bf16(<vscale x 32 x bfloat> %0)
+ ret bfloat %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv-s-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv-s-bf.ll
new file mode 100644
index 0000000..a810809
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmv-s-bf.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmv.s.f.nxv1bf16(<vscale x 1 x bfloat>, bfloat, iXLen)
+
+define <vscale x 1 x bfloat> @intrinsic_vfmv.s.f_f_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmv.s.f.nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2)
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmv.s.f.nxv2bf16(<vscale x 2 x bfloat>, bfloat, iXLen)
+
+define <vscale x 2 x bfloat> @intrinsic_vfmv.s.f_f_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmv.s.f.nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2)
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmv.s.f.nxv4bf16(<vscale x 4 x bfloat>, bfloat, iXLen)
+
+define <vscale x 4 x bfloat> @intrinsic_vfmv.s.f_f_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmv.s.f.nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2)
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmv.s.f.nxv8bf16(<vscale x 8 x bfloat>, bfloat, iXLen)
+
+define <vscale x 8 x bfloat> @intrinsic_vfmv.s.f_f_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmv.s.f.nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2)
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmv.s.f.nxv16bf16(<vscale x 16 x bfloat>, bfloat, iXLen)
+
+define <vscale x 16 x bfloat> @intrinsic_vfmv.s.f_f_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmv.s.f.nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2)
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmv.s.f.nxv32bf16(<vscale x 32 x bfloat>, bfloat, iXLen)
+
+define <vscale x 32 x bfloat> @intrinsic_vfmv.s.f_f_nxv32bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmv.s.f.nxv32bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2)
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmv.s.f_f_zero_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_zero_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmv.s.f.nxv1bf16(<vscale x 1 x bfloat> %0, bfloat 0.0, iXLen %1)
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_vfmv.s.f_f_zero_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_zero_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmv.s.f.nxv2bf16(<vscale x 2 x bfloat> %0, bfloat 0.0, iXLen %1)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_vfmv.s.f_f_zero_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_zero_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmv.s.f.nxv4bf16(<vscale x 4 x bfloat> %0, bfloat 0.0, iXLen %1)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_vfmv.s.f_f_zero_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_zero_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmv.s.f.nxv8bf16(<vscale x 8 x bfloat> %0, bfloat 0.0, iXLen %1)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_vfmv.s.f_f_zero_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_zero_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmv.s.f.nxv16bf16(<vscale x 16 x bfloat> %0, bfloat 0.0, iXLen %1)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_vfmv.s.f_f_zero_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_zero_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmv.s.f.nxv32bf16(<vscale x 32 x bfloat> %0, bfloat 0.0, iXLen %1)
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmv.s.f_f_nxv1bf16_negzero(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1bf16_negzero:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 1048568
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmv.s.f.nxv1bf16(<vscale x 1 x bfloat> %0, bfloat -0.0, iXLen %1)
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv-v-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv-v-bf.ll
new file mode 100644
index 0000000..f3293dd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmv-v-bf.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfmv.v.f.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfmv.v.f_f_nxv1bf16(bfloat %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.v.f_f_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmv.v.f.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ bfloat %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfmv.v.f.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfmv.v.f_f_nxv2bf16(bfloat %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.v.f_f_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmv.v.f.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ bfloat %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfmv.v.f.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfmv.v.f_f_nxv4bf16(bfloat %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.v.f_f_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmv.v.f.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ bfloat %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfmv.v.f.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfmv.v.f_f_nxv8bf16(bfloat %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.v.f_f_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmv.v.f.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ bfloat %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfmv.v.f.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfmv.v.f_f_nxv16bf16(bfloat %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.v.f_f_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmv.v.f.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ bfloat %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfmv.v.f.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfmv.v.f_f_nxv32bf16(bfloat %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.v.f_f_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmv.v.f.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ bfloat %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfmv.v.f_zero_nxv1bf16(iXLen %0) nounwind {
+; CHECK-LABEL: intrinsic_vfmv.v.f_zero_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfmv.v.f.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ bfloat 0.0,
+ iXLen %0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @intrinsic_vmv.v.i_zero_nxv2bf16(iXLen %0) nounwind {
+; CHECK-LABEL: intrinsic_vmv.v.i_zero_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfmv.v.f.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ bfloat 0.0,
+ iXLen %0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @intrinsic_vmv.v.i_zero_nxv4bf16(iXLen %0) nounwind {
+; CHECK-LABEL: intrinsic_vmv.v.i_zero_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfmv.v.f.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ bfloat 0.0,
+ iXLen %0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @intrinsic_vmv.v.i_zero_nxv8bf16(iXLen %0) nounwind {
+; CHECK-LABEL: intrinsic_vmv.v.i_zero_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfmv.v.f.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ bfloat 0.0,
+ iXLen %0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @intrinsic_vmv.v.i_zero_nxv16bf16(iXLen %0) nounwind {
+; CHECK-LABEL: intrinsic_vmv.v.i_zero_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfmv.v.f.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ bfloat 0.0,
+ iXLen %0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @intrinsic_vmv.v.i_zero_nxv32bf16(iXLen %0) nounwind {
+; CHECK-LABEL: intrinsic_vmv.v.i_zero_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfmv.v.f.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ bfloat 0.0,
+ iXLen %0)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-bf-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-bf-f.ll
new file mode 100644
index 0000000..7d587fd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-bf-f.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv1bf16.nxv1f32(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x float>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfncvt_rod.f.f.w_nxv1bf16_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rod.f.f.w_nxv1bf16_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfncvt.rod.f.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv1bf16.nxv1f32(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x float>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfncvt_mask_rod.f.f.w_nxv1bf16_nxv1f32(<vscale x 1 x bfloat> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rod.f.f.w_nxv1bf16_nxv1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfncvt.rod.f.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv2bf16.nxv2f32(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x float>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfncvt_rod.f.f.w_nxv2bf16_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rod.f.f.w_nxv2bf16_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfncvt.rod.f.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv2bf16.nxv2f32(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x float>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfncvt_mask_rod.f.f.w_nxv2bf16_nxv2f32(<vscale x 2 x bfloat> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rod.f.f.w_nxv2bf16_nxv2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfncvt.rod.f.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv4bf16.nxv4f32(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x float>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfncvt_rod.f.f.w_nxv4bf16_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rod.f.f.w_nxv4bf16_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfncvt.rod.f.f.w v10, v8
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv4bf16.nxv4f32(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x float>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfncvt_mask_rod.f.f.w_nxv4bf16_nxv4f32(<vscale x 4 x bfloat> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rod.f.f.w_nxv4bf16_nxv4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfncvt.rod.f.f.w v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv8bf16.nxv8f32(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x float>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfncvt_rod.f.f.w_nxv8bf16_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rod.f.f.w_nxv8bf16_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfncvt.rod.f.f.w v12, v8
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv8bf16.nxv8f32(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x float>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfncvt_mask_rod.f.f.w_nxv8bf16_nxv8f32(<vscale x 8 x bfloat> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rod.f.f.w_nxv8bf16_nxv8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfncvt.rod.f.f.w v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv16bf16.nxv16f32(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x float>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfncvt_rod.f.f.w_nxv16bf16_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rod.f.f.w_nxv16bf16_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfncvt.rod.f.f.w v16, v8
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv16bf16.nxv16f32(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x float> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x float>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfncvt_mask_rod.f.f.w_nxv16bf16_nxv16f32(<vscale x 16 x bfloat> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rod.f.f.w_nxv16bf16_nxv16f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfncvt.rod.f.f.w v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-bf.ll
new file mode 100644
index 0000000..ee9e3d1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-bf.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_rtz.x.f.w_nxv1i8_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.x.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_mask_rtz.x.f.w_nxv1i8_nxv1bf16(<vscale x 1 x i8> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.x.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, mu
+; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_rtz.x.f.w_nxv2i8_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.x.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_mask_rtz.x.f.w_nxv2i8_nxv2bf16(<vscale x 2 x i8> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.x.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, mu
+; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_rtz.x.f.w_nxv4i8_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.x.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_mask_rtz.x.f.w_nxv4i8_nxv4bf16(<vscale x 4 x i8> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.x.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, mu
+; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_rtz.x.f.w_nxv8i8_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.x.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_mask_rtz.x.f.w_nxv8i8_nxv8bf16(<vscale x 8 x i8> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.x.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, mu
+; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_rtz.x.f.w_nxv16i8_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.x.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_mask_rtz.x.f.w_nxv16i8_nxv16bf16(<vscale x 16 x i8> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.x.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, mu
+; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_rtz.x.f.w_nxv32i8_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.x.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_mask_rtz.x.f.w_nxv32i8_nxv32bf16(<vscale x 32 x i8> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.x.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, mu
+; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x i8> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-bf.ll
new file mode 100644
index 0000000..521f727
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-bf.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_rtz.xu.f.w_nxv1i8_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.xu.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_mask_rtz.xu.f.w_nxv1i8_nxv1bf16(<vscale x 1 x i8> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.xu.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, mu
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_rtz.xu.f.w_nxv2i8_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.xu.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_mask_rtz.xu.f.w_nxv2i8_nxv2bf16(<vscale x 2 x i8> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.xu.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, mu
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_rtz.xu.f.w_nxv4i8_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.xu.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_mask_rtz.xu.f.w_nxv4i8_nxv4bf16(<vscale x 4 x i8> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.xu.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, mu
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_rtz.xu.f.w_nxv8i8_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.xu.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_mask_rtz.xu.f.w_nxv8i8_nxv8bf16(<vscale x 8 x i8> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.xu.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, mu
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_rtz.xu.f.w_nxv16i8_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.xu.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_mask_rtz.xu.f.w_nxv16i8_nxv16bf16(<vscale x 16 x i8> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.xu.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, mu
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_rtz.xu.f.w_nxv32i8_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_rtz.xu.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v16, v8
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_mask_rtz.xu.f.w_nxv32i8_nxv32bf16(<vscale x 32 x i8> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_rtz.xu.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, mu
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x i8> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-bf.ll
new file mode 100644
index 0000000..ab9ebad
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-bf.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_x.f.w_nxv1i8_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv1i8_nxv1bf16(<vscale x 1 x i8> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, mu
+; CHECK-NEXT: vfncvt.x.f.w v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_x.f.w_nxv2i8_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv2i8_nxv2bf16(<vscale x 2 x i8> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, mu
+; CHECK-NEXT: vfncvt.x.f.w v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_x.f.w_nxv4i8_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv4i8_nxv4bf16(<vscale x 4 x i8> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, mu
+; CHECK-NEXT: vfncvt.x.f.w v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_x.f.w_nxv8i8_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v10, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv8i8_nxv8bf16(<vscale x 8 x i8> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, mu
+; CHECK-NEXT: vfncvt.x.f.w v8, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_x.f.w_nxv16i8_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v12, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv16i8_nxv16bf16(<vscale x 16 x i8> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, mu
+; CHECK-NEXT: vfncvt.x.f.w v8, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_x.f.w_nxv32i8_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v16, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv32i8_nxv32bf16(<vscale x 32 x i8> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, mu
+; CHECK-NEXT: vfncvt.x.f.w v8, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x i8> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-bf.ll
new file mode 100644
index 0000000..61c6803
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-bf.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_xu.f.w_nxv1i8_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv1i8_nxv1bf16(<vscale x 1 x i8> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv1i8_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, mu
+; CHECK-NEXT: vfncvt.xu.f.w v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16(
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_xu.f.w_nxv2i8_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv2i8_nxv2bf16(<vscale x 2 x i8> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv2i8_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, mu
+; CHECK-NEXT: vfncvt.xu.f.w v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16(
+ <vscale x 2 x i8> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_xu.f.w_nxv4i8_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv4i8_nxv4bf16(<vscale x 4 x i8> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv4i8_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, mu
+; CHECK-NEXT: vfncvt.xu.f.w v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16(
+ <vscale x 4 x i8> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_xu.f.w_nxv8i8_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v10, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv8i8_nxv8bf16(<vscale x 8 x i8> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv8i8_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, mu
+; CHECK-NEXT: vfncvt.xu.f.w v8, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16(
+ <vscale x 8 x i8> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_xu.f.w_nxv16i8_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v12, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv16i8_nxv16bf16(<vscale x 16 x i8> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv16i8_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, mu
+; CHECK-NEXT: vfncvt.xu.f.w v8, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16(
+ <vscale x 16 x i8> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_xu.f.w_nxv32i8_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v16, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv32i8_nxv32bf16(<vscale x 32 x i8> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv32i8_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, mu
+; CHECK-NEXT: vfncvt.xu.f.w v8, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16(
+ <vscale x 32 x i8> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x i8> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmacc-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmacc-bf.ll
new file mode 100644
index 0000000..4b4091b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmacc-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmacc_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmacc_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmacc_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmacc_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmacc_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmacc_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmacc.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmacc_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmacc.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmacc_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmacc.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmacc_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmacc.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmacc_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmacc_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmacc_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmacc_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmacc_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmacc_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmacc_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmacc_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmacc_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmacc_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmadd.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmadd.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmacc_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-bf.ll
new file mode 100644
index 0000000..2bb6bf5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmadd.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmadd_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmadd_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmadd.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmadd_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmadd_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmadd.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmadd_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmadd_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmadd.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmadd_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmadd.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmadd_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmadd.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmadd_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmadd.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmadd_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmadd_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmadd_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmadd_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmadd_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmadd_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmadd_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmadd_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmadd_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmadd_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmadd.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmadd.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmacc.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmadd_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmacc.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsac-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsac-bf.ll
new file mode 100644
index 0000000..cfbaafa
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsac-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsac_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsac_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsac_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsac_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsac_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsac_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmsac.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsac_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmsac.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsac_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmsac.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsac_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmsac.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsac_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsac_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsac_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsac_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsac_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsac_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsac_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsac_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsac_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsac_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmsub.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmsub.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsac_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-bf.ll
new file mode 100644
index 0000000..5ebbb90c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-bf.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmsub.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsub_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsub_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmsub.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsub_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsub_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmsub.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsub_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsub_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmsub.vv v8, v10, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsub_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmsub.vv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsub_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmsub.vv v8, v12, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsub_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmsub.vv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsub_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsub_mask_vf_nxv1bf16_bf16_nxv1bf16(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv1bf16_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsub_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfnmsub_mask_vf_nxv2bf16_bf16_nxv2bf16(<vscale x 2 x bfloat> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv2bf16_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsub_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfnmsub_mask_vf_nxv4bf16_bf16_nxv4bf16(<vscale x 4 x bfloat> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv4bf16_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsub_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfnmsub_mask_vf_nxv8bf16_bf16_nxv8bf16(<vscale x 8 x bfloat> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv8bf16_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsub_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfnmsub_mask_vf_nxv16bf16_bf16_nxv16bf16(<vscale x 16 x bfloat> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv16bf16_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfnmsub.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0);
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmsub.vv v8, v9, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %2,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv1bf16_nxv1bf16_nxv1bf16_commute2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmsac.vv v8, v10, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 1 x bfloat> @intrinsic_vfnmsub_vf_nxv1bf16_bf16_nxv1bf16_commute(<vscale x 1 x bfloat> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv1bf16_bf16_nxv1bf16_commute:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfnmsac.vf v8, fa0, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %2,
+ bfloat %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 7, iXLen %3, iXLen 3)
+
+ ret <vscale x 1 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrec7-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfrec7-bf.ll
new file mode 100644
index 0000000..1211415
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrec7-bf.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfrec7.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfrec7_v_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfrec7.v v8, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfrec7.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfrec7.mask.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfrec7_mask_v_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfrec7.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfrec7.mask.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %0,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfrec7.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfrec7_v_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfrec7.v v8, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfrec7.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfrec7.mask.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfrec7_mask_v_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfrec7.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfrec7.mask.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %0,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfrec7.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfrec7_v_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfrec7.v v8, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfrec7.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfrec7.mask.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfrec7_mask_v_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfrec7.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfrec7.mask.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %0,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfrec7.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfrec7_v_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfrec7.v v8, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfrec7.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfrec7.mask.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfrec7_mask_v_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfrec7.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfrec7.mask.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %0,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfrec7.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfrec7_v_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfrec7.v v8, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfrec7.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfrec7.mask.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfrec7_mask_v_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfrec7.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfrec7.mask.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %0,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfrec7.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfrec7_v_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfrec7.v v8, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfrec7.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen 0, iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfrec7.mask.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfrec7_mask_v_nxv32bf16_nxv32bf16(<vscale x 32 x i1> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfrec7.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfrec7.mask.nxv32bf16(
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %0,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-bf16.ll
new file mode 100644
index 0000000..4626b86
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-bf16.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfrsqrt7.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfrsqrt7_v_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfrsqrt7.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfrsqrt7.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfrsqrt7_mask_v_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %0,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfrsqrt7.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfrsqrt7_v_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfrsqrt7.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfrsqrt7.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfrsqrt7_mask_v_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %0,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfrsqrt7.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfrsqrt7_v_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfrsqrt7.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfrsqrt7.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfrsqrt7_mask_v_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %0,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfrsqrt7.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfrsqrt7_v_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfrsqrt7.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfrsqrt7.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfrsqrt7_mask_v_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfrsqrt7.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %0,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfrsqrt7.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfrsqrt7_v_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfrsqrt7.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfrsqrt7.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfrsqrt7_mask_v_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfrsqrt7.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %0,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfrsqrt7.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfrsqrt7_v_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfrsqrt7.v v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfrsqrt7.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfrsqrt7_mask_v_nxv32bf16_nxv32bf16(<vscale x 32 x i1> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfrsqrt7.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfrsqrt7.mask.nxv32bf16(
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %0,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrsub-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfrsub-bf.ll
new file mode 100644
index 0000000..54a6d48
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrsub-bf.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfrsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfrsub_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfrsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfrsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfrsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfrsub_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfrsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfrsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfrsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfrsub_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfrsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfrsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfrsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfrsub_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfrsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfrsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfrsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfrsub_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfrsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfrsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfrsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfrsub_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfrsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfrsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfrsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfrsub_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfrsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfrsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfrsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfrsub_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfrsub.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfrsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfrsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfrsub_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfrsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfrsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfrsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfrsub_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfrsub.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfrsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfrsub.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfrsub_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfrsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfrsub.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfrsub.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfrsub_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfrsub_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfrsub.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfrsub.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsgnj-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfsgnj-bf.ll
new file mode 100644
index 0000000..2cd698d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsgnj-bf.ll
@@ -0,0 +1,571 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnj_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnj_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnj_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnj_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnj_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnj_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnj_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnj_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnj_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnj_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnj_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnj_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnj_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsgnj.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnj_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsgnj.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnj_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsgnj.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnj_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnj_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsgnj.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnj_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnj_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsgnj.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnj_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnj_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsgnj.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnj_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnj_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsgnj.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnj_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnj_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsgnjn-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfsgnjn-bf.ll
new file mode 100644
index 0000000..08340be
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsgnjn-bf.ll
@@ -0,0 +1,571 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjn_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsgnjn.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjn_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsgnjn.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjn_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsgnjn.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjn_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsgnjn.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjn_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsgnjn.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjn_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsgnjn.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjn_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsgnjn.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjn_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsgnjn.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjn_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsgnjn.vv v8, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjn_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsgnjn.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjn_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsgnjn.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjn_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsgnjn.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjn_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsgnjn.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjn_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsgnjn.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjn_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsgnjn.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjn_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsgnjn.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjn_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsgnjn.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjn_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsgnjn.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjn_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsgnjn.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjn_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsgnjn.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjn_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsgnjn.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjn_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsgnjn.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjn_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsgnjn.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjn_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjn_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsgnjn.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsgnjx-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfsgnjx-bf.ll
new file mode 100644
index 0000000..e51a42e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsgnjx-bf.ll
@@ -0,0 +1,571 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjx_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsgnjx.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjx_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsgnjx.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjx_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsgnjx.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjx_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsgnjx.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjx_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsgnjx.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjx_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsgnjx.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjx_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsgnjx.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjx_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsgnjx.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjx_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsgnjx.vv v8, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjx_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsgnjx.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjx_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsgnjx.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjx_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsgnjx.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjx_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsgnjx.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsgnjx_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsgnjx.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjx_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsgnjx.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsgnjx_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsgnjx.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjx_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsgnjx.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsgnjx_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsgnjx.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjx_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsgnjx.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsgnjx_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsgnjx.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjx_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsgnjx.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsgnjx_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsgnjx.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjx_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsgnjx.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsgnjx_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsgnjx_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsgnjx.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfslide1down-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfslide1down-bf.ll
new file mode 100644
index 0000000..c65719c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfslide1down-bf.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfslide1down.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfslide1down_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfslide1down.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfslide1down_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfslide1down.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfslide1down.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfslide1down_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfslide1down.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfslide1down_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfslide1down.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfslide1down.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfslide1down_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfslide1down.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfslide1down_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfslide1down.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfslide1down.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfslide1down_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfslide1down.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfslide1down_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfslide1down.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfslide1down.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfslide1down_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfslide1down.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfslide1down_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfslide1down.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfslide1down.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfslide1down_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfslide1down.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfslide1down_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1down_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfslide1down.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfslide1up-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfslide1up-bf.ll
new file mode 100644
index 0000000..57a4898
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfslide1up-bf.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfslide1up.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfslide1up_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfslide1up.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfslide1up_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfslide1up.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfslide1up.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfslide1up_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfslide1up.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfslide1up_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfslide1up.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfslide1up.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfslide1up_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
+; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfslide1up.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfslide1up_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfslide1up.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfslide1up.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfslide1up_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfslide1up.vf v10, v8, fa0
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfslide1up.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfslide1up_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfslide1up.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfslide1up.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfslide1up_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfslide1up.vf v12, v8, fa0
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfslide1up.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfslide1up_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfslide1up.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfslide1up.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfslide1up_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfslide1up.vf v16, v8, fa0
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfslide1up.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfslide1up_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfslide1up_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfslide1up.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-bf.ll
new file mode 100644
index 0000000..aea7521
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-bf.ll
@@ -0,0 +1,559 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsub_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsub_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vv_nxv1bf16_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsub_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsub_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vv_nxv2bf16_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsub.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsub_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsub.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsub_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vv_nxv4bf16_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsub.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsub_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsub.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsub_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vv_nxv8bf16_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsub.vv v8, v10, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsub.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsub_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsub.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsub_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vv_nxv16bf16_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsub.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsub.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsub_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsub.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsub_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x bfloat> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vv_nxv32bf16_nxv32bf16_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl8re16.v v24, (a0)
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsub.vv v8, v16, v24, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ <vscale x 32 x bfloat> %2,
+ <vscale x 32 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsub_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfsub_mask_vf_nxv1bf16_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vf_nxv1bf16_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.mask.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsub_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfsub_mask_vf_nxv2bf16_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vf_nxv2bf16_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.mask.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsub_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsub.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfsub_mask_vf_nxv4bf16_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vf_nxv4bf16_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfsub.mask.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsub_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsub.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfsub_mask_vf_nxv8bf16_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vf_nxv8bf16_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfsub.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfsub.mask.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsub_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsub.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfsub_mask_vf_nxv16bf16_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vf_nxv16bf16_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfsub.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfsub.mask.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsub.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsub_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; CHECK-NEXT: vfsub.vf v8, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsub.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x bfloat> %0,
+ bfloat %1,
+ iXLen 7, iXLen %2)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfsub.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x bfloat>,
+ bfloat,
+ <vscale x 32 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfsub_mask_vf_nxv32bf16_nxv32bf16_bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, bfloat %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfsub_mask_vf_nxv32bf16_nxv32bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m8, ta, mu
+; CHECK-NEXT: vfsub.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfsub.mask.nxv32bf16.bf16(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x bfloat> %1,
+ bfloat %2,
+ <vscale x 32 x i1> %3,
+ iXLen 7, iXLen %4, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd-bf.ll
new file mode 100644
index 0000000..62feac8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd-bf.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwadd.vv v10, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd_mask_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwadd.vv v10, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd_mask_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vmv1r.v v11, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.vv v8, v11, v10
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd_mask_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwadd.vv v8, v10, v11, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v10
+; CHECK-NEXT: vmv2r.v v14, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.vv v8, v14, v12
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd_mask_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwadd.vv v8, v12, v14, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v12
+; CHECK-NEXT: vmv4r.v v20, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.vv v8, v20, v16
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd_mask_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwadd.vv v8, v16, v20, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd_vf_nxv1f32_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vf_nxv1f32_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwadd.vf v9, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd_mask_vf_nxv1f32_nxv1bf16_bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv1f32_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwadd.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd_vf_nxv2f32_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vf_nxv2f32_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwadd.vf v9, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd_mask_vf_nxv2f32_nxv2bf16_bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv2f32_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwadd.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd_vf_nxv4f32_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vf_nxv4f32_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.vf v8, v10, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd_mask_vf_nxv4f32_nxv4bf16_bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv4f32_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwadd.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd_vf_nxv8f32_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vf_nxv8f32_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.vf v8, v12, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd_mask_vf_nxv8f32_nxv8bf16_bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv8f32_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwadd.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd_vf_nxv16f32_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_vf_nxv16f32_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.vf v8, v16, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd_mask_vf_nxv16f32_nxv16bf16_bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv16f32_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwadd.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd-w-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd-w-bf.ll
new file mode 100644
index 0000000..c5417e8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd-w-bf.ll
@@ -0,0 +1,773 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd.w_wv_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwadd.wv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wv_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd.w_wv_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwadd.wv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wv_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd.w_wv_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfwadd.wv v8, v8, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wv_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd.w_wv_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfwadd.wv v8, v8, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wv_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd.w_wv_nxv16f32_nxv16f32_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv16f32_nxv16f32_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfwadd.wv v8, v8, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wv_nxv16f32_nxv16f32_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv16f32_nxv16f32_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl4re16.v v24, (a0)
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v16, v24, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd.w_wf_nxv1f32_nxv1f32_bf16(<vscale x 1 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv1f32_nxv1f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wf_nxv1f32_nxv1f32_bf16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv1f32_nxv1f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd.w_wf_nxv2f32_nxv2f32_bf16(<vscale x 2 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv2f32_nxv2f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wf_nxv2f32_nxv2f32_bf16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv2f32_nxv2f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd.w_wf_nxv4f32_nxv4f32_bf16(<vscale x 4 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv4f32_nxv4f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wf_nxv4f32_nxv4f32_bf16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv4f32_nxv4f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v10, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd.w_wf_nxv8f32_nxv8f32_bf16(<vscale x 8 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv8f32_nxv8f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wf_nxv8f32_nxv8f32_bf16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv8f32_nxv8f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v12, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd.w_wf_nxv16f32_nxv16f32_bf16(<vscale x 16 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv16f32_nxv16f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wf_nxv16f32_nxv16f32_bf16(<vscale x 16 x float> %0, <vscale x 16 x float> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv16f32_nxv16f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v16, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v8, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v8, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwadd.wv v8, v8, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv1f32_nxv1f32_bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv1f32_nxv1f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv2f32_nxv2f32_bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv2f32_nxv2f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv4f32_nxv4f32_bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv4f32_nxv4f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv8f32_nxv8f32_bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv8f32_nxv8f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv16f32_nxv16f32_bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv16f32_nxv16f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_vfwadd.w_wv_untie_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwadd.wv v10, v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_vfwadd.w_wv_untie_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwadd.wv v10, v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_vfwadd.w_wv_untie_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v12, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.wv v8, v10, v12
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_vfwadd.w_wv_untie_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v16, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwadd.wv v8, v12, v16
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-x.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-x.ll
new file mode 100644
index 0000000..b7df45b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-x.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i8>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfwcvt_f.x.v_nxv1bf16_nxv1i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv1bf16_nxv1i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, ma
+; CHECK-NEXT: vfwcvt.f.x.v v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i8>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfwcvt_mask_f.x.v_nxv1bf16_nxv1i8(<vscale x 1 x bfloat> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.x.v_nxv1bf16_nxv1i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, mu
+; CHECK-NEXT: vfwcvt.f.x.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x i8> %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i8>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfwcvt_f.x.v_nxv2bf16_nxv2i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv2bf16_nxv2i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, ma
+; CHECK-NEXT: vfwcvt.f.x.v v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i8>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfwcvt_mask_f.x.v_nxv2bf16_nxv2i8(<vscale x 2 x bfloat> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.x.v_nxv2bf16_nxv2i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, mu
+; CHECK-NEXT: vfwcvt.f.x.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x i8> %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i8>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfwcvt_f.x.v_nxv4bf16_nxv4i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv4bf16_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, ma
+; CHECK-NEXT: vfwcvt.f.x.v v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfwcvt_mask_f.x.v_nxv4bf16_nxv4i8(<vscale x 4 x bfloat> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.x.v_nxv4bf16_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, mu
+; CHECK-NEXT: vfwcvt.f.x.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i8>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfwcvt_f.x.v_nxv8bf16_nxv8i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv8bf16_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: vfwcvt.f.x.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfwcvt_mask_f.x.v_nxv8bf16_nxv8i8(<vscale x 8 x bfloat> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.x.v_nxv8bf16_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, mu
+; CHECK-NEXT: vfwcvt.f.x.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i8>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfwcvt_f.x.v_nxv16bf16_nxv16i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv16bf16_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: vfwcvt.f.x.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfwcvt_mask_f.x.v_nxv16bf16_nxv16i8(<vscale x 16 x bfloat> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.x.v_nxv16bf16_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, mu
+; CHECK-NEXT: vfwcvt.f.x.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i8>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfwcvt_f.x.v_nxv32bf16_nxv32i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.x.v_nxv32bf16_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: vfwcvt.f.x.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfwcvt_mask_f.x.v_nxv32bf16_nxv32i8(<vscale x 32 x bfloat> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.x.v_nxv32bf16_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, mu
+; CHECK-NEXT: vfwcvt.f.x.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-xu.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-xu.ll
new file mode 100644
index 0000000..c370261
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-bf-xu.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i8>,
+ iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfwcvt_f.xu.v_nxv1bf16_nxv1i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv1bf16_nxv1i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, ma
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat> poison,
+ <vscale x 1 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i8>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 1 x bfloat> @intrinsic_vfwcvt_mask_f.xu.v_nxv1bf16_nxv1i8(<vscale x 1 x bfloat> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.xu.v_nxv1bf16_nxv1i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf8, ta, mu
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x i8> %1,
+ <vscale x 1 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i8>,
+ iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfwcvt_f.xu.v_nxv2bf16_nxv2i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv2bf16_nxv2i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, ma
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat> poison,
+ <vscale x 2 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i8>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 2 x bfloat> @intrinsic_vfwcvt_mask_f.xu.v_nxv2bf16_nxv2i8(<vscale x 2 x bfloat> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.xu.v_nxv2bf16_nxv2i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf4, ta, mu
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x i8> %1,
+ <vscale x 2 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i8>,
+ iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfwcvt_f.xu.v_nxv4bf16_nxv4i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv4bf16_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, ma
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat> poison,
+ <vscale x 4 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i8>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 4 x bfloat> @intrinsic_vfwcvt_mask_f.xu.v_nxv4bf16_nxv4i8(<vscale x 4 x bfloat> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.xu.v_nxv4bf16_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, mf2, ta, mu
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x i8> %1,
+ <vscale x 4 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i8>,
+ iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfwcvt_f.xu.v_nxv8bf16_nxv8i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv8bf16_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat> poison,
+ <vscale x 8 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 8 x bfloat> @intrinsic_vfwcvt_mask_f.xu.v_nxv8bf16_nxv8i8(<vscale x 8 x bfloat> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.xu.v_nxv8bf16_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m1, ta, mu
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v10, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x i8> %1,
+ <vscale x 8 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i8>,
+ iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfwcvt_f.xu.v_nxv16bf16_nxv16i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv16bf16_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat> poison,
+ <vscale x 16 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i8>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 16 x bfloat> @intrinsic_vfwcvt_mask_f.xu.v_nxv16bf16_nxv16i8(<vscale x 16 x bfloat> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.xu.v_nxv16bf16_nxv16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m2, ta, mu
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x i8> %1,
+ <vscale x 16 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i8>,
+ iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfwcvt_f.xu.v_nxv32bf16_nxv32i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_f.xu.v_nxv32bf16_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v16
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat> poison,
+ <vscale x 32 x i8> %0,
+ iXLen %1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
+declare <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat>,
+ <vscale x 32 x i8>,
+ <vscale x 32 x i1>,
+ iXLen, iXLen);
+
+define <vscale x 32 x bfloat> @intrinsic_vfwcvt_mask_f.xu.v_nxv32bf16_nxv32i8(<vscale x 32 x bfloat> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwcvt_mask_f.xu.v_nxv32bf16_nxv32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8alt, m4, ta, mu
+; CHECK-NEXT: vfwcvt.f.xu.v v8, v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8(
+ <vscale x 32 x bfloat> %0,
+ <vscale x 32 x i8> %1,
+ <vscale x 32 x i1> %2,
+ iXLen %3, iXLen 1)
+
+ ret <vscale x 32 x bfloat> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-bf.ll
new file mode 100644
index 0000000..a3f6678
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-bf.ll
@@ -0,0 +1,506 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmsac_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfwmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmsac_mask_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfwmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmsac_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfwmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmsac_mask_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfwmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmsac_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfwmsac.vv v8, v10, v11
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmsac_mask_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfwmsac.vv v8, v10, v11, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmsac_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfwmsac.vv v8, v12, v14
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmsac_mask_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfwmsac.vv v8, v12, v14, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmsac_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfwmsac.vv v8, v16, v20
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmsac_mask_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfwmsac.vv v8, v16, v20, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmsac_vf_nxv1f32_bf16_nxv1bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv1f32_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmsac_mask_vf_nxv1f32_bf16_nxv1bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv1f32_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmsac_vf_nxv2f32_bf16_nxv2bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv2f32_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmsac_mask_vf_nxv2f32_bf16_nxv2bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv2f32_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmsac_vf_nxv4f32_bf16_nxv4bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv4f32_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmsac_mask_vf_nxv4f32_bf16_nxv4bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv4f32_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmsac_vf_nxv8f32_bf16_nxv8bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv8f32_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmsac_mask_vf_nxv8f32_bf16_nxv8bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv8f32_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmsac_vf_nxv16f32_bf16_nxv16bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv16f32_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmsac_mask_vf_nxv16f32_bf16_nxv16bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv16f32_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmul-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmul-bf.ll
new file mode 100644
index 0000000..577b93a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmul-bf.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmul_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwmul.vv v10, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmul_mask_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwmul.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmul_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwmul.vv v10, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmul_mask_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwmul.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmul_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vmv1r.v v11, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwmul.vv v8, v11, v10
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmul_mask_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwmul.vv v8, v10, v11, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmul_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v10
+; CHECK-NEXT: vmv2r.v v14, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwmul.vv v8, v14, v12
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmul_mask_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwmul.vv v8, v12, v14, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmul_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v12
+; CHECK-NEXT: vmv4r.v v20, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwmul.vv v8, v20, v16
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmul_mask_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwmul.vv v8, v16, v20, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmul_vf_nxv1f32_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vf_nxv1f32_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwmul.vf v9, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwmul_mask_vf_nxv1f32_nxv1bf16_bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv1f32_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwmul.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmul_vf_nxv2f32_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vf_nxv2f32_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwmul.vf v9, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwmul_mask_vf_nxv2f32_nxv2bf16_bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv2f32_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwmul.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmul_vf_nxv4f32_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vf_nxv4f32_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwmul.vf v8, v10, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwmul_mask_vf_nxv4f32_nxv4bf16_bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv4f32_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwmul.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmul_vf_nxv8f32_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vf_nxv8f32_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwmul.vf v8, v12, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwmul_mask_vf_nxv8f32_nxv8bf16_bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv8f32_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwmul.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmul_vf_nxv16f32_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_vf_nxv16f32_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwmul.vf v8, v16, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwmul_mask_vf_nxv16f32_nxv16bf16_bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv16f32_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwmul.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-bf.ll
new file mode 100644
index 0000000..1e05e4c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-bf.ll
@@ -0,0 +1,506 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmacc_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfwnmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmacc_mask_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfwnmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmacc_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfwnmacc.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmacc_mask_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfwnmacc.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmacc_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfwnmacc.vv v8, v10, v11
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmacc_mask_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfwnmacc.vv v8, v10, v11, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmacc_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfwnmacc.vv v8, v12, v14
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmacc_mask_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfwnmacc.vv v8, v12, v14, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmacc_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfwnmacc.vv v8, v16, v20
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmacc_mask_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfwnmacc.vv v8, v16, v20, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmacc_vf_nxv1f32_bf16_nxv1bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv1f32_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmacc_mask_vf_nxv1f32_bf16_nxv1bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv1f32_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmacc_vf_nxv2f32_bf16_nxv2bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv2f32_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmacc_mask_vf_nxv2f32_bf16_nxv2bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv2f32_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmacc_vf_nxv4f32_bf16_nxv4bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv4f32_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmacc_mask_vf_nxv4f32_bf16_nxv4bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv4f32_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmacc_vf_nxv8f32_bf16_nxv8bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv8f32_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmacc_mask_vf_nxv8f32_bf16_nxv8bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv8f32_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmacc_vf_nxv16f32_bf16_nxv16bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv16f32_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmacc_mask_vf_nxv16f32_bf16_nxv16bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv16f32_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-bf.ll
new file mode 100644
index 0000000..223ad4f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-bf.ll
@@ -0,0 +1,506 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmsac_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfwnmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmsac_mask_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfwnmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmsac_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfwnmsac.vv v8, v9, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmsac_mask_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfwnmsac.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmsac_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfwnmsac.vv v8, v10, v11
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmsac_mask_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfwnmsac.vv v8, v10, v11, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmsac_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfwnmsac.vv v8, v12, v14
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmsac_mask_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfwnmsac.vv v8, v12, v14, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmsac_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfwnmsac.vv v8, v16, v20
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmsac_mask_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfwnmsac.vv v8, v16, v20, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmsac_vf_nxv1f32_bf16_nxv1bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv1f32_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, ma
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwnmsac_mask_vf_nxv1f32_bf16_nxv1bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv1f32_bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, tu, mu
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmsac_vf_nxv2f32_bf16_nxv2bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv2f32_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, ma
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwnmsac_mask_vf_nxv2f32_bf16_nxv2bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv2f32_bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, tu, mu
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmsac_vf_nxv4f32_bf16_nxv4bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv4f32_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, ma
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwnmsac_mask_vf_nxv4f32_bf16_nxv4bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv4f32_bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, tu, mu
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmsac_vf_nxv8f32_bf16_nxv8bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv8f32_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, ma
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwnmsac_mask_vf_nxv8f32_bf16_nxv8bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv8f32_bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, tu, mu
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmsac_vf_nxv16f32_bf16_nxv16bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x bfloat> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv16f32_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, ma
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen 0, iXLen %3, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwnmsac_mask_vf_nxv16f32_bf16_nxv16bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv16f32_bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, tu, mu
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 0)
+
+ ret <vscale x 16 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub-bf.ll
new file mode 100644
index 0000000..d993e4e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub-bf.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwsub.vv v10, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub_mask_vv_nxv1f32_nxv1bf16_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv1f32_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwsub.vv v10, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub_mask_vv_nxv2f32_nxv2bf16_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv2f32_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwsub.vv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vmv1r.v v11, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.vv v8, v11, v10
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub_mask_vv_nxv4f32_nxv4bf16_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv4f32_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwsub.vv v8, v10, v11, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v10
+; CHECK-NEXT: vmv2r.v v14, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.vv v8, v14, v12
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub_mask_vv_nxv8f32_nxv8bf16_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv8f32_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwsub.vv v8, v12, v14, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v12
+; CHECK-NEXT: vmv4r.v v20, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.vv v8, v20, v16
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub_mask_vv_nxv16f32_nxv16bf16_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv16f32_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwsub.vv v8, v16, v20, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub_vf_nxv1f32_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vf_nxv1f32_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwsub.vf v9, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub_mask_vf_nxv1f32_nxv1bf16_bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv1f32_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub_vf_nxv2f32_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vf_nxv2f32_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwsub.vf v9, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub_mask_vf_nxv2f32_nxv2bf16_bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv2f32_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwsub.vf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub_vf_nxv4f32_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vf_nxv4f32_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.vf v8, v10, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub_mask_vf_nxv4f32_nxv4bf16_bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv4f32_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwsub.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub_vf_nxv8f32_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vf_nxv8f32_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.vf v8, v12, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub_mask_vf_nxv8f32_nxv8bf16_bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv8f32_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwsub.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub_vf_nxv16f32_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_vf_nxv16f32_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.vf v8, v16, fa0
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub_mask_vf_nxv16f32_nxv16bf16_bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv16f32_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwsub.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub-w-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub-w-bf.ll
new file mode 100644
index 0000000..b22899a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub-w-bf.ll
@@ -0,0 +1,773 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub.w_wv_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwsub.wv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wv_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub.w_wv_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwsub.wv v8, v8, v9
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wv_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v9, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub.w_wv_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfwsub.wv v8, v8, v10
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wv_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v10, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub.w_wv_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfwsub.wv v8, v8, v12
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wv_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v12, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub.w_wv_nxv16f32_nxv16f32_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv16f32_nxv16f32_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfwsub.wv v8, v8, v16
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wv_nxv16f32_nxv16f32_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv16f32_nxv16f32_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl4re16.v v24, (a0)
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vsetvli zero, a1, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v16, v24, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub.w_wf_nxv1f32_nxv1f32_bf16(<vscale x 1 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv1f32_nxv1f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16(
+ <vscale x 1 x float>,
+ <vscale x 1 x float>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wf_nxv1f32_nxv1f32_bf16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv1f32_nxv1f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub.w_wf_nxv2f32_nxv2f32_bf16(<vscale x 2 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv2f32_nxv2f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16(
+ <vscale x 2 x float>,
+ <vscale x 2 x float>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wf_nxv2f32_nxv2f32_bf16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv2f32_nxv2f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v9, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub.w_wf_nxv4f32_nxv4f32_bf16(<vscale x 4 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv4f32_nxv4f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16(
+ <vscale x 4 x float>,
+ <vscale x 4 x float>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wf_nxv4f32_nxv4f32_bf16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv4f32_nxv4f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v10, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub.w_wf_nxv8f32_nxv8f32_bf16(<vscale x 8 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv8f32_nxv8f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16(
+ <vscale x 8 x float>,
+ <vscale x 8 x float>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wf_nxv8f32_nxv8f32_bf16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv8f32_nxv8f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v12, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ bfloat,
+ iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub.w_wf_nxv16f32_nxv16f32_bf16(<vscale x 16 x float> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv16f32_nxv16f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.bf16(
+ <vscale x 16 x float> poison,
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16(
+ <vscale x 16 x float>,
+ <vscale x 16 x float>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen, iXLen, iXLen);
+
+define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wf_nxv16f32_nxv16f32_bf16(<vscale x 16 x float> %0, <vscale x 16 x float> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv16f32_nxv16f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v16, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen 0, iXLen %4, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x float> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %0,
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x float> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v8, v9, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %0,
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x float> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v8, v10, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %0,
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x float> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v8, v12, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %0,
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16bf16(<vscale x 16 x float> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwsub.wv v8, v8, v16, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %0,
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv1f32_nxv1f32_bf16(<vscale x 1 x float> %0, bfloat %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv1f32_nxv1f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16(
+ <vscale x 1 x float> %0,
+ <vscale x 1 x float> %0,
+ bfloat %1,
+ <vscale x 1 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv2f32_nxv2f32_bf16(<vscale x 2 x float> %0, bfloat %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv2f32_nxv2f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16(
+ <vscale x 2 x float> %0,
+ <vscale x 2 x float> %0,
+ bfloat %1,
+ <vscale x 2 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv4f32_nxv4f32_bf16(<vscale x 4 x float> %0, bfloat %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv4f32_nxv4f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16(
+ <vscale x 4 x float> %0,
+ <vscale x 4 x float> %0,
+ bfloat %1,
+ <vscale x 4 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv8f32_nxv8f32_bf16(<vscale x 8 x float> %0, bfloat %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv8f32_nxv8f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16(
+ <vscale x 8 x float> %0,
+ <vscale x 8 x float> %0,
+ bfloat %1,
+ <vscale x 8 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 8 x float> %a
+}
+
+define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv16f32_nxv16f32_bf16(<vscale x 16 x float> %0, bfloat %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv16f32_nxv16f32_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vfwsub.wf v8, v8, fa0, v0.t
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16(
+ <vscale x 16 x float> %0,
+ <vscale x 16 x float> %0,
+ bfloat %1,
+ <vscale x 16 x i1> %2,
+ iXLen 0, iXLen %3, iXLen 1)
+
+ ret <vscale x 16 x float> %a
+}
+
+define <vscale x 1 x float> @intrinsic_vfwsub.w_wv_untie_nxv1f32_nxv1f32_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv1f32_nxv1f32_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vfwsub.wv v10, v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16(
+ <vscale x 1 x float> poison,
+ <vscale x 1 x float> %1,
+ <vscale x 1 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 1 x float> %a
+}
+
+define <vscale x 2 x float> @intrinsic_vfwsub.w_wv_untie_nxv2f32_nxv2f32_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv2f32_nxv2f32_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsrmi a1, 0
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vfwsub.wv v10, v9, v8
+; CHECK-NEXT: fsrm a1
+; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16(
+ <vscale x 2 x float> poison,
+ <vscale x 2 x float> %1,
+ <vscale x 2 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 2 x float> %a
+}
+
+define <vscale x 4 x float> @intrinsic_vfwsub.w_wv_untie_nxv4f32_nxv4f32_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv4f32_nxv4f32_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v12, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.wv v8, v10, v12
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16(
+ <vscale x 4 x float> poison,
+ <vscale x 4 x float> %1,
+ <vscale x 4 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 4 x float> %a
+}
+
+define <vscale x 8 x float> @intrinsic_vfwsub.w_wv_untie_nxv8f32_nxv8f32_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv8f32_nxv8f32_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmv2r.v v16, v8
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfwsub.wv v8, v12, v16
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16(
+ <vscale x 8 x float> poison,
+ <vscale x 8 x float> %1,
+ <vscale x 8 x bfloat> %0,
+ iXLen 0, iXLen %2)
+
+ ret <vscale x 8 x float> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfeq-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vmfeq-bf.ll
new file mode 100644
index 0000000..9bd859b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfeq-bf.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfeq.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfeq_vv_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfeq.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
+; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 1 x i1> @llvm.riscv.vmfeq.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %3,
+ <vscale x 1 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfeq.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfeq_vv_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfeq.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
+; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 2 x i1> @llvm.riscv.vmfeq.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x bfloat> %3,
+ <vscale x 2 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfeq_vv_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
+; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv.v.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x bfloat> %3,
+ <vscale x 4 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfeq_vv_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfeq_mask_vv_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v14, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v10
+; CHECK-NEXT: vmfeq.vv v14, v10, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x bfloat> %3,
+ <vscale x 8 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfeq.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfeq_vv_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfeq.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfeq.mask.nxv16bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfeq_mask_vv_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v20, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v12
+; CHECK-NEXT: vmfeq.vv v20, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v20
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 16 x i1> @llvm.riscv.vmfeq.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfeq.mask.nxv16bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x bfloat> %3,
+ <vscale x 16 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfeq.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfeq_vf_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfeq.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vf_nxv1bf16_bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfeq.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfeq.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfeq_vf_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfeq.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vf_nxv2bf16_bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfeq.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfeq_vf_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vf_nxv4bf16_bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfeq.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfeq_vf_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfeq_mask_vf_nxv8bf16_bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vf v11, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfeq.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfeq_vf_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfeq.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfeq.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfeq_mask_vf_nxv16bf16_bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfeq_mask_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfeq.vf v13, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfeq.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfge-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vmfge-bf.ll
new file mode 100644
index 0000000..73946dc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfge-bf.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfge.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfge_vv_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v9, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfge.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfle.vv v0, v9, v8
+; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 1 x i1> @llvm.riscv.vmfge.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %3,
+ <vscale x 1 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfge_vv_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v9, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfle.vv v0, v9, v8
+; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x bfloat> %3,
+ <vscale x 2 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfge_vv_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v9, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfle.vv v0, v9, v8
+; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
+; CHECK-NEXT: vmv.v.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x bfloat> %3,
+ <vscale x 4 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfge_vv_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v10, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfge_mask_vv_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v14, v0
+; CHECK-NEXT: vmfle.vv v0, v10, v8
+; CHECK-NEXT: vmfle.vv v14, v12, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x bfloat> %3,
+ <vscale x 8 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfge.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfge_vv_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v12, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfge.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfge_mask_vv_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v20, v0
+; CHECK-NEXT: vmfle.vv v0, v12, v8
+; CHECK-NEXT: vmfle.vv v20, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v20
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 16 x i1> @llvm.riscv.vmfge.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x bfloat> %3,
+ <vscale x 16 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfge.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfge_vf_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfge.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfge.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfge_mask_vf_nxv1bf16_bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfge.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfge_vf_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfge.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfge_mask_vf_nxv2bf16_bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfge.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfge_vf_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfge.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfge_mask_vf_nxv4bf16_bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfge.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfge_vf_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfge.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfge_mask_vf_nxv8bf16_bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfge.vf v11, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfge.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfge_vf_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfge.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfge.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfge_mask_vf_nxv16bf16_bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfge_mask_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfge.vf v13, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfgt-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vmfgt-bf.ll
new file mode 100644
index 0000000..fac324c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfgt-bf.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfgt.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfgt_vv_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfgt.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 1 x i1> @llvm.riscv.vmfgt.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %3,
+ <vscale x 1 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfgt_vv_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x bfloat> %3,
+ <vscale x 2 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfgt_vv_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
+; CHECK-NEXT: vmv.v.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x bfloat> %3,
+ <vscale x 4 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfgt_vv_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfgt_mask_vv_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v14, v0
+; CHECK-NEXT: vmflt.vv v0, v10, v8
+; CHECK-NEXT: vmflt.vv v14, v12, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x bfloat> %3,
+ <vscale x 8 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfgt.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfgt_vv_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v12, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfgt.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfgt_mask_vv_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v20, v0
+; CHECK-NEXT: vmflt.vv v0, v12, v8
+; CHECK-NEXT: vmflt.vv v20, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v20
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 16 x i1> @llvm.riscv.vmfgt.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x bfloat> %3,
+ <vscale x 16 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfgt.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfgt_vf_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfgt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfgt.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vf_nxv1bf16_bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfgt_vf_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfgt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vf_nxv2bf16_bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfgt_vf_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfgt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vf_nxv4bf16_bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfgt_vf_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfgt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfgt_mask_vf_nxv8bf16_bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfgt.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfgt_vf_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfgt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfgt.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfgt_mask_vf_nxv16bf16_bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfgt_mask_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfgt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfle-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vmfle-bf.ll
new file mode 100644
index 0000000..8356b7b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfle-bf.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfle_vv_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v9
+; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %3,
+ <vscale x 1 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfle_vv_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v9
+; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x bfloat> %3,
+ <vscale x 2 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfle_vv_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v9
+; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv.v.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x bfloat> %3,
+ <vscale x 4 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfle_vv_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfle_mask_vv_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v14, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v10
+; CHECK-NEXT: vmfle.vv v14, v10, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x bfloat> %3,
+ <vscale x 8 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfle_vv_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfle_mask_vv_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v20, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v12
+; CHECK-NEXT: vmfle.vv v20, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v20
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x bfloat> %3,
+ <vscale x 16 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfle_vf_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfle.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfle_mask_vf_nxv1bf16_bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfle.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfle_vf_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfle.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfle_mask_vf_nxv2bf16_bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfle.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfle_vf_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfle.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfle_mask_vf_nxv4bf16_bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfle.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfle_vf_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfle.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfle_mask_vf_nxv8bf16_bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfle.vf v11, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfle_vf_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfle.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfle_mask_vf_nxv16bf16_bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfle_mask_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfle.vf v13, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmflt-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vmflt-bf.ll
new file mode 100644
index 0000000..2e1bcc5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vmflt-bf.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vmflt.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmflt_vv_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmflt.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 1 x i1> @llvm.riscv.vmflt.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %3,
+ <vscale x 1 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmflt_vv_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x bfloat> %3,
+ <vscale x 2 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmflt_vv_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv.v.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x bfloat> %3,
+ <vscale x 4 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmflt_vv_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmflt_mask_vv_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v14, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v10
+; CHECK-NEXT: vmflt.vv v14, v10, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x bfloat> %3,
+ <vscale x 8 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmflt.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmflt_vv_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmflt.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmflt_mask_vv_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v20, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v12
+; CHECK-NEXT: vmflt.vv v20, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v20
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 16 x i1> @llvm.riscv.vmflt.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x bfloat> %3,
+ <vscale x 16 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmflt.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmflt_vf_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmflt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmflt.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmflt_mask_vf_nxv1bf16_bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmflt_vf_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmflt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmflt_mask_vf_nxv2bf16_bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmflt_vf_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmflt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmflt_mask_vf_nxv4bf16_bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmflt_vf_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmflt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmflt_mask_vf_nxv8bf16_bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmflt.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmflt_vf_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmflt.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmflt.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmflt_mask_vf_nxv16bf16_bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmflt_mask_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmflt.vf v13, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfne-bf.ll b/llvm/test/CodeGen/RISCV/rvv/vmfne-bf.ll
new file mode 100644
index 0000000..283ffc5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfne-bf.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \
+; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfne.nxv1bf16(
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfne_vv_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfne.nxv1bf16(
+ <vscale x 1 x bfloat> %0,
+ <vscale x 1 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x bfloat>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1bf16_nxv1bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x bfloat> %2, <vscale x 1 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1bf16_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v9
+; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 1 x i1> @llvm.riscv.vmfne.nxv1bf16(
+ <vscale x 1 x bfloat> %1,
+ <vscale x 1 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %2,
+ <vscale x 1 x bfloat> %3,
+ <vscale x 1 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfne.nxv2bf16(
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfne_vv_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfne.nxv2bf16(
+ <vscale x 2 x bfloat> %0,
+ <vscale x 2 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x bfloat>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2bf16_nxv2bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x bfloat> %2, <vscale x 2 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2bf16_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v9
+; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 2 x i1> @llvm.riscv.vmfne.nxv2bf16(
+ <vscale x 2 x bfloat> %1,
+ <vscale x 2 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %2,
+ <vscale x 2 x bfloat> %3,
+ <vscale x 2 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4bf16(
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfne_vv_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4bf16(
+ <vscale x 4 x bfloat> %0,
+ <vscale x 4 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x bfloat>,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4bf16_nxv4bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x bfloat> %2, <vscale x 4 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4bf16_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v9
+; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
+; CHECK-NEXT: vmv.v.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4bf16(
+ <vscale x 4 x bfloat> %1,
+ <vscale x 4 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %2,
+ <vscale x 4 x bfloat> %3,
+ <vscale x 4 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8bf16(
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfne_vv_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8bf16(
+ <vscale x 8 x bfloat> %0,
+ <vscale x 8 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfne_mask_vv_nxv8bf16_nxv8bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv8bf16_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v14, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v10
+; CHECK-NEXT: vmfne.vv v14, v10, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8bf16(
+ <vscale x 8 x bfloat> %1,
+ <vscale x 8 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %2,
+ <vscale x 8 x bfloat> %3,
+ <vscale x 8 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfne.nxv16bf16(
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfne_vv_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v12
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfne.nxv16bf16(
+ <vscale x 16 x bfloat> %0,
+ <vscale x 16 x bfloat> %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x bfloat>,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfne_mask_vv_nxv16bf16_nxv16bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x bfloat> %2, <vscale x 16 x bfloat> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv16bf16_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v20, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v12
+; CHECK-NEXT: vmfne.vv v20, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v20
+; CHECK-NEXT: ret
+entry:
+ %mask = call <vscale x 16 x i1> @llvm.riscv.vmfne.nxv16bf16(
+ <vscale x 16 x bfloat> %1,
+ <vscale x 16 x bfloat> %2,
+ iXLen %4)
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %2,
+ <vscale x 16 x bfloat> %3,
+ <vscale x 16 x i1> %mask,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfne.nxv1bf16.bf16(
+ <vscale x 1 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfne_vf_nxv1bf16_bf16(<vscale x 1 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; CHECK-NEXT: vmfne.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfne.nxv1bf16.bf16(
+ <vscale x 1 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1>,
+ <vscale x 1 x bfloat>,
+ bfloat,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define <vscale x 1 x i1> @intrinsic_vmfne_mask_vf_nxv1bf16_bf16(<vscale x 1 x i1> %0, <vscale x 1 x bfloat> %1, bfloat %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vf_nxv1bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfne.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1bf16.bf16(
+ <vscale x 1 x i1> %0,
+ <vscale x 1 x bfloat> %1,
+ bfloat %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfne.nxv2bf16.bf16(
+ <vscale x 2 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfne_vf_nxv2bf16_bf16(<vscale x 2 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; CHECK-NEXT: vmfne.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfne.nxv2bf16.bf16(
+ <vscale x 2 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1>,
+ <vscale x 2 x bfloat>,
+ bfloat,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define <vscale x 2 x i1> @intrinsic_vmfne_mask_vf_nxv2bf16_bf16(<vscale x 2 x i1> %0, <vscale x 2 x bfloat> %1, bfloat %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vf_nxv2bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfne.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2bf16.bf16(
+ <vscale x 2 x i1> %0,
+ <vscale x 2 x bfloat> %1,
+ bfloat %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4bf16.bf16(
+ <vscale x 4 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfne_vf_nxv4bf16_bf16(<vscale x 4 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; CHECK-NEXT: vmfne.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4bf16.bf16(
+ <vscale x 4 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1>,
+ <vscale x 4 x bfloat>,
+ bfloat,
+ <vscale x 4 x i1>,
+ iXLen);
+
+define <vscale x 4 x i1> @intrinsic_vmfne_mask_vf_nxv4bf16_bf16(<vscale x 4 x i1> %0, <vscale x 4 x bfloat> %1, bfloat %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vf_nxv4bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfne.vf v10, v8, fa0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4bf16.bf16(
+ <vscale x 4 x i1> %0,
+ <vscale x 4 x bfloat> %1,
+ bfloat %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8bf16.bf16(
+ <vscale x 8 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfne_vf_nxv8bf16_bf16(<vscale x 8 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; CHECK-NEXT: vmfne.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8bf16.bf16(
+ <vscale x 8 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1>,
+ <vscale x 8 x bfloat>,
+ bfloat,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define <vscale x 8 x i1> @intrinsic_vmfne_mask_vf_nxv8bf16_bf16(<vscale x 8 x i1> %0, <vscale x 8 x bfloat> %1, bfloat %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vf_nxv8bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfne.vf v11, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8bf16.bf16(
+ <vscale x 8 x i1> %0,
+ <vscale x 8 x bfloat> %1,
+ bfloat %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfne.nxv16bf16.bf16(
+ <vscale x 16 x bfloat>,
+ bfloat,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfne_vf_nxv16bf16_bf16(<vscale x 16 x bfloat> %0, bfloat %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; CHECK-NEXT: vmfne.vf v0, v8, fa0
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfne.nxv16bf16.bf16(
+ <vscale x 16 x bfloat> %0,
+ bfloat %1,
+ iXLen %2)
+
+ ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1>,
+ <vscale x 16 x bfloat>,
+ bfloat,
+ <vscale x 16 x i1>,
+ iXLen);
+
+define <vscale x 16 x i1> @intrinsic_vmfne_mask_vf_nxv16bf16_bf16(<vscale x 16 x i1> %0, <vscale x 16 x bfloat> %1, bfloat %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmfne_mask_vf_nxv16bf16_bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v13, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfne.vf v13, v8, fa0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16bf16.bf16(
+ <vscale x 16 x i1> %0,
+ <vscale x 16 x bfloat> %1,
+ bfloat %2,
+ <vscale x 16 x i1> %3,
+ iXLen %4)
+
+ ret <vscale x 16 x i1> %a
+}
+
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll
new file mode 100644
index 0000000..105f4a4
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-unknown-vulkan1.3-compute --spirv-ext=+SPV_KHR_maximal_reconvergence %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute --spirv-ext=+SPV_KHR_maximal_reconvergence %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpCapability Shader
+; CHECK: OpExtension "SPV_KHR_maximal_reconvergence"
+; CHECK-NOT: OpExecutionMode {{.*}} MaximallyReconvergesKHR
+; CHECK: OpExecutionMode [[main:%[0-9]+]] MaximallyReconvergesKHR
+; CHECK-NOT: OpExecutionMode {{.*}} MaximallyReconvergesKHR
+; CHECK: OpName [[main]] "main"
+define void @main() local_unnamed_addr #0 {
+entry:
+ ret void
+}
+
+define void @negative() local_unnamed_addr #1 {
+entry:
+ ret void
+}
+
+attributes #0 = { "enable-maximal-reconvergence"="true" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+attributes #1 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll
new file mode 100644
index 0000000..cce1eda
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll
@@ -0,0 +1,22 @@
+; RUN: llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
+
+%"__cblayout_$Globals" = type <{ i32 }>
+
+@i = external hidden local_unnamed_addr addrspace(12) global i32, align 4
+@ReadWriteBuf.str = private unnamed_addr constant [13 x i8] c"ReadWriteBuf\00", align 1
+@"$Globals.cb" = local_unnamed_addr global target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) poison
+@"$Globals.str" = private unnamed_addr constant [9 x i8] c"$Globals\00", align 1
+
+; CHECK: OpCapability Shader
+; CHECK: OpCapability StorageTexelBufferArrayDynamicIndexingEXT
+
+define void @main() local_unnamed_addr #0 {
+entry:
+ %"$Globals.cb_h.i.i" = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) @"llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_$Globalss_4_0t_2_0t"(i32 1, i32 0, i32 1, i32 0, ptr nonnull @"$Globals.str")
+ store target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) %"$Globals.cb_h.i.i", ptr @"$Globals.cb", align 8
+ %0 = load i32, ptr addrspace(12) @i, align 4
+ %1 = tail call target("spirv.Image", i32, 5, 2, 0, 0, 2, 33) @llvm.spv.resource.handlefromimplicitbinding.tspirv.Image_i32_5_2_0_0_2_33t(i32 0, i32 0, i32 64, i32 %0, ptr nonnull @ReadWriteBuf.str)
+ %2 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.Image_i32_5_2_0_0_2_33t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 33) %1, i32 98)
+ store i32 99, ptr addrspace(11) %2, align 4
+ ret void
+} \ No newline at end of file
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll
new file mode 100644
index 0000000..da69a2f
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
+
+%"__cblayout_$Globals" = type <{ i32 }>
+
+@i = external hidden local_unnamed_addr addrspace(12) global i32, align 4
+@ReadWriteStructuredBuf.str = private unnamed_addr constant [23 x i8] c"ReadWriteStructuredBuf\00", align 1
+@"$Globals.cb" = local_unnamed_addr global target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) poison
+@"$Globals.str" = private unnamed_addr constant [9 x i8] c"$Globals\00", align 1
+
+; CHECK: OpCapability Shader
+; CHECK: OpCapability StorageBufferArrayDynamicIndexing
+define void @main() local_unnamed_addr #0 {
+entry:
+ %"$Globals.cb_h.i.i" = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) @"llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_$Globalss_4_0t_2_0t"(i32 2, i32 0, i32 1, i32 0, ptr nonnull @"$Globals.str")
+ store target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) %"$Globals.cb_h.i.i", ptr @"$Globals.cb", align 8
+ %0 = load i32, ptr addrspace(12) @i, align 4
+ %1 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 0, i32 64, i32 %0, ptr nonnull @ReadWriteStructuredBuf.str)
+ %2 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %1, i32 99)
+ store i32 98, ptr addrspace(11) %2, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/StructuredBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWBufferNonUniformIdx.ll
index 92efad9..92efad9 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/StructuredBufferNonUniformIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWBufferNonUniformIdx.ll
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll
index 2a12baf..a820e7a 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll
@@ -3,6 +3,7 @@
; CHECK-DAG: OpCapability Shader
; CHECK-DAG: OpCapability ShaderNonUniformEXT
+; CHECK-DAG: OpCapability StorageBufferArrayNonUniformIndexingEXT
; CHECK-DAG: OpDecorate {{%[0-9]+}} NonUniformEXT
; CHECK-DAG: OpDecorate {{%[0-9]+}} NonUniformEXT
; CHECK-DAG: OpDecorate {{%[0-9]+}} NonUniformEXT
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageConstIdx.ll
index d002097..e4ec231 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageConstIdx.ll
@@ -4,8 +4,8 @@
@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
; CHECK-DAG: OpCapability Shader
-; CHECK-DAG: OpCapability StorageImageArrayDynamicIndexing
; CHECK-DAG: OpCapability Image1D
+; CHECK-DAG: OpCapability Int8
; CHECK-NOT: OpCapability
; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3
diff --git a/llvm/test/CodeGen/X86/avx-shift.ll b/llvm/test/CodeGen/X86/avx-shift.ll
index c9c09d7..3bce843 100644
--- a/llvm/test/CodeGen/X86/avx-shift.ll
+++ b/llvm/test/CodeGen/X86/avx-shift.ll
@@ -201,7 +201,7 @@ define <8 x i32> @vshift08_add(<8 x i32> %a, <8 x i32> %y) {
define <4 x i32> @vshift13(<4 x i32> %in) {
; CHECK-LABEL: vshift13:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,16]
; CHECK-NEXT: retq
%T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %T
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 70b3b99..1133cdfd 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -199,12 +199,12 @@ define <8 x i32> @mul_const5(<8 x i32> %x) {
define <8 x i32> @mul_const6(<8 x i32> %x) {
; X86-LABEL: mul_const6:
; X86: # %bb.0:
-; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [0,0,0,2,0,2,0,0]
; X86-NEXT: retl
;
; X64-LABEL: mul_const6:
; X64: # %bb.0:
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,0,0,2,0,2,0,0]
; X64-NEXT: retq
%y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
ret <8 x i32> %y
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index ae4d24f..29c41ca 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -66,7 +66,7 @@ define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2b:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,16]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2b:
@@ -120,12 +120,12 @@ define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2b:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967294,4294967292,4294967280]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_negpow2b:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967294,4294967292,4294967280]
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
ret <4 x i32> %1
@@ -176,12 +176,12 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_shl_const:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,12,1280,458752]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_const:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,12,1280,458752]
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
%2 = mul <4 x i32> %1, <i32 1, i32 3, i32 5, i32 7>
@@ -193,7 +193,7 @@ define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse0:
; SSE: # %bb.0:
; SSE-NEXT: pmulld %xmm1, %xmm0
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse0:
@@ -210,7 +210,7 @@ define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse1:
; SSE: # %bb.0:
; SSE-NEXT: pmulld %xmm1, %xmm0
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse1:
@@ -226,7 +226,7 @@ define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse0:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: pmulld %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
@@ -246,7 +246,7 @@ define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse1:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: pmulld %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
@@ -268,13 +268,13 @@ define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_add:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,6,2,0]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_add:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,6,2,0]
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = add <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
diff --git a/llvm/test/CodeGen/X86/combine-multiplies.ll b/llvm/test/CodeGen/X86/combine-multiplies.ll
index a5d9846..4bdf20d 100644
--- a/llvm/test/CodeGen/X86/combine-multiplies.ll
+++ b/llvm/test/CodeGen/X86/combine-multiplies.ll
@@ -142,9 +142,9 @@ define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT: paddd %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [22,33,44,55]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [33,u,55,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 70335f8..ff5329c 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -204,16 +204,16 @@ define i32 @PR43159(ptr %a0) {
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [344322273,344322273,1916962805,1916962805]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $1, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: psrld $7, %xmm0
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1645975491,344322273,2164392969,1916962805]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE-NEXT: psrld $6, %xmm1
; SSE-NEXT: movd %xmm1, %edi
@@ -226,15 +226,15 @@ define i32 @PR43159(ptr %a0) {
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,344322273,1916962805,1916962805]
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrld $7, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1645975491,344322273,2164392969,1916962805]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpsrld $6, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %edi
@@ -247,9 +247,9 @@ define i32 @PR43159(ptr %a0) {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -270,9 +270,9 @@ define i32 @PR43159(ptr %a0) {
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -293,9 +293,9 @@ define i32 @PR43159(ptr %a0) {
; AVX512DQVL: # %bb.0: # %entry
; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512DQVL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll
index 65d74c8..e7152ec 100644
--- a/llvm/test/CodeGen/X86/combine-rotates.ll
+++ b/llvm/test/CodeGen/X86/combine-rotates.ll
@@ -10,9 +10,9 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [524288,131072,32768,8192]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [131072,u,8192,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 6bcbfe1..f7baee9 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2927,7 +2927,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,37632]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -2947,7 +2947,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
@@ -2971,7 +2971,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
@@ -3044,7 +3044,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
+; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 1ce10c37..9548967 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -88,7 +88,7 @@ define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,u,8192,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -97,7 +97,7 @@ define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65536,32768,16384,8192]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
@@ -198,16 +198,16 @@ define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,64,256,1024]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,u,1024,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,64,256,1024]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl1:
@@ -304,17 +304,17 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [131072,524288,2097152,8388608]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [524288,u,8388608,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [33554432,134217728,536870912,2147483648]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [134217728,u,2147483648,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm2, %xmm0
@@ -323,10 +323,10 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [131072,524288,2097152,8388608]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [33554432,134217728,536870912,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -673,9 +673,9 @@ define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,u,16,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -683,7 +683,7 @@ define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
@@ -726,9 +726,9 @@ define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,u,16,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -736,7 +736,7 @@ define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
@@ -765,7 +765,7 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [20,20,20,20]
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_mul0:
@@ -787,21 +787,21 @@ define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10,24,56,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [24,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10,24,56,128]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10,24,56,128]
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
@@ -813,9 +813,9 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,16,32]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,u,32,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -823,7 +823,7 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,16,32]
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
@@ -852,7 +852,7 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,u,32,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -863,7 +863,7 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,16,32]
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index 4b01c16..0ca79ad 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -272,7 +272,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) {
; SSE-NEXT: psrad $2, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,2,4,8]
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -291,7 +291,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) {
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,2,4,8]
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -336,7 +336,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967294,4294967292,4294967288,4294967280]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -358,7 +358,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [4294967294,4294967292,4294967288,4294967280]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -368,7 +368,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [4294967294,4294967292,4294967288,4294967280]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = srem <4 x i32> %x, <i32 -2, i32 -4, i32 -8, i32 -16>
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index c90344b8..233735d 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -665,14 +665,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
-; XOP-NEXT: movl $171, %eax
+; XOP-NEXT: movl $249, %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
-; XOP-NEXT: movl $249, %eax
-; XOP-NEXT: vmovd %eax, %xmm2
-; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [171,0,0,0]
+; XOP-NEXT: vpsrlw $8, %xmm2, %xmm2
+; XOP-NEXT: vpshlb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-umax.ll b/llvm/test/CodeGen/X86/combine-umax.ll
index 25f8ec8..482b4fc 100644
--- a/llvm/test/CodeGen/X86/combine-umax.ll
+++ b/llvm/test/CodeGen/X86/combine-umax.ll
@@ -60,7 +60,7 @@ define <16 x i8> @test_v16i8_reassociation(<16 x i8> %a) {
define <16 x i8> @test_v16i8_demandedbits(<16 x i8> %x, <16 x i8> %y, <16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test_v16i8_demandedbits:
; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/combine-umin.ll b/llvm/test/CodeGen/X86/combine-umin.ll
index 76dbcb5..e2757d0 100644
--- a/llvm/test/CodeGen/X86/combine-umin.ll
+++ b/llvm/test/CodeGen/X86/combine-umin.ll
@@ -77,7 +77,7 @@ define <16 x i8> @test_v16i8_reassociation(<16 x i8> %a) {
define <16 x i8> @test_v16i8_demandedbits(<16 x i8> %x, <16 x i8> %y, <16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test_v16i8_demandedbits:
; SSE2: # %bb.0:
-; SSE2-NEXT: pminub %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index 715d5c7..34c7d3d 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -327,7 +327,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: pslld $23, %xmm1
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,4,8,16]
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
@@ -338,7 +338,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,4,8,16]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
index 345b2b9..19b9452 100644
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -437,9 +437,9 @@ define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind {
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,256,128,64]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,u,64,u]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index df97f49..252cb33 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -574,9 +574,9 @@ define <4 x i32> @fshl_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: fshl_v4i32_undef1_cst:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,1024,2048,4096]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1024,u,4096,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
@@ -746,9 +746,9 @@ define <4 x i32> @fshr_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: fshr_v4i32_undef1_cst:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [8388608,4194304,2097152,1048576]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [4194304,u,1048576,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 1a2aac6..b45d01e 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -499,9 +499,9 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,16776960,2147483648]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [1,u,2147483648,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
@@ -524,9 +524,9 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,16776960,2147483648]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,u,2147483648,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT: pand %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc..019bca7 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -28,16 +28,16 @@ define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) {
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1073741824,u,67108864,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-NEXT: movdqa %xmm1, %xmm3
; CHECK-NEXT: psrld $1, %xmm3
; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [9,4,16,64]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [4,u,64,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-NEXT: psubd %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 4ec54d8..2a2a4a5 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2057,10 +2057,10 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294934528,0,0,0]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,7,0,42,0,32,0]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32768,4294934528,0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm1
@@ -2072,14 +2072,14 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,7,42,32]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32768,4294934528,0,0]
; AVX1-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: pmaddwd_negative2:
; AVX256: # %bb.0:
; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX256-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX256-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [32768,4294934528,0,0,1,7,42,32]
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 693d199..9729fd7 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -100,7 +100,7 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
; SSE4-LABEL: p4_vector_urem_by_const__splat:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; SSE4-NEXT: psrld $1, %xmm0
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
; SSE4-NEXT: pcmpgtd %xmm0, %xmm1
@@ -128,10 +128,10 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,u,954437177,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,3435973837,2863311531,954437177]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2147483648,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: psrlq $32, %xmm0
@@ -145,7 +145,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE4-LABEL: p5_vector_urem_by_const__nonsplat:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,3435973837,2863311531,954437177]
; SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = [1,2147483648]
; SSE4-NEXT: pmuludq %xmm0, %xmm1
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -159,7 +159,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; AVX2-LABEL: p5_vector_urem_by_const__nonsplat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,3435973837,2863311531,954437177]
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -196,7 +196,7 @@ define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32
; SSE4-LABEL: p6_vector_urem_by_const__nonsplat_undef0:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; SSE4-NEXT: movdqa %xmm0, %xmm1
; SSE4-NEXT: psrld $1, %xmm1
; SSE4-NEXT: pslld $31, %xmm0
@@ -312,7 +312,7 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [6,6,6,6]
; SSE4-NEXT: psubd %xmm2, %xmm0
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 9aee2f1..00731fe 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -91,7 +91,7 @@ define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
;
; SSE41-LABEL: mul_v4i32c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117]
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32c:
diff --git a/llvm/test/CodeGen/X86/pr162812.ll b/llvm/test/CodeGen/X86/pr162812.ll
index 4ea3101..cec093c 100644
--- a/llvm/test/CodeGen/X86/pr162812.ll
+++ b/llvm/test/CodeGen/X86/pr162812.ll
@@ -34,61 +34,43 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) {
;
; SSE42-LABEL: PR162812:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm2, %xmm5
-; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: psrlw $2, %xmm2
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [8224,8224,8224,8224,8224,8224,8224,8224]
+; SSE42-NEXT: pand %xmm5, %xmm2
+; SSE42-NEXT: paddb %xmm2, %xmm2
+; SSE42-NEXT: paddb %xmm2, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm6
-; SSE42-NEXT: psllw $2, %xmm6
-; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; SSE42-NEXT: pand %xmm7, %xmm6
-; SSE42-NEXT: psrlw $2, %xmm5
-; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224]
-; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: paddb %xmm0, %xmm6
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm4
+; SSE42-NEXT: psrlw $2, %xmm3
+; SSE42-NEXT: pand %xmm3, %xmm5
; SSE42-NEXT: paddb %xmm5, %xmm5
-; SSE42-NEXT: movdqa %xmm5, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; SSE42-NEXT: movdqa %xmm2, %xmm6
-; SSE42-NEXT: paddb %xmm2, %xmm6
; SSE42-NEXT: paddb %xmm5, %xmm5
+; SSE42-NEXT: movdqa %xmm1, %xmm2
+; SSE42-NEXT: paddb %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm5, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; SSE42-NEXT: movdqa %xmm1, %xmm5
-; SSE42-NEXT: psllw $2, %xmm5
-; SSE42-NEXT: pand %xmm7, %xmm5
-; SSE42-NEXT: psrlw $2, %xmm3
-; SSE42-NEXT: pand %xmm3, %xmm4
-; SSE42-NEXT: paddb %xmm4, %xmm4
-; SSE42-NEXT: movdqa %xmm4, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: paddb %xmm1, %xmm3
-; SSE42-NEXT: paddb %xmm4, %xmm4
+; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE42-NEXT: movdqa %xmm4, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: PR162812:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR162812:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 885b075..59b03f8 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
; AVX256BW: # %bb.0:
; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index 1ead3f9..7d0ec64 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -149,19 +149,12 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; Result would undershift
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
-; X86-LABEL: no_extract_shl:
-; X86: # %bb.0:
-; X86-NEXT: vpsllq $24, %ymm0, %ymm1
-; X86-NEXT: vpsrlq $39, %ymm0, %ymm0
-; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0
-; X86-NEXT: retl
-;
-; X64-LABEL: no_extract_shl:
-; X64: # %bb.0:
-; X64-NEXT: vpsllq $24, %ymm0, %ymm1
-; X64-NEXT: vpsrlq $39, %ymm0, %ymm0
-; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: no_extract_shl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $24, %ymm0, %ymm1
+; CHECK-NEXT: vpsrlq $39, %ymm0, %ymm0
+; CHECK-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & m64bcst) | ymm1
+; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
%rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
%lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
@@ -171,19 +164,12 @@ define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; Result would overshift
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
-; X86-LABEL: no_extract_shrl:
-; X86: # %bb.0:
-; X86-NEXT: vpsrld $9, %xmm0, %xmm1
-; X86-NEXT: vpslld $25, %xmm0, %xmm0
-; X86-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0
-; X86-NEXT: retl
-;
-; X64-LABEL: no_extract_shrl:
-; X64: # %bb.0:
-; X64-NEXT: vpsrld $9, %xmm0, %xmm1
-; X64-NEXT: vpslld $25, %xmm0, %xmm0
-; X64-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: no_extract_shrl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrld $9, %xmm0, %xmm1
+; CHECK-NEXT: vpslld $25, %xmm0, %xmm0
+; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = (xmm0 & m32bcst) | xmm1
+; CHECK-NEXT: ret{{[l|q]}}
%lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
%rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
%lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll
index 4568191..7873ffa 100644
--- a/llvm/test/CodeGen/X86/sdiv-exact.ll
+++ b/llvm/test/CodeGen/X86/sdiv-exact.ll
@@ -87,7 +87,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2863311531,u,3264175145,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -95,7 +95,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3264175145,3264175145]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
ret <4 x i32> %div
@@ -112,7 +112,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm0, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2863311531,u,3303820997,u]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqa %xmm1, %xmm0
@@ -121,7 +121,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X64-LABEL: test6:
; X64: # %bb.0:
; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3303820997,3303820997]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
ret <4 x i32> %div
@@ -131,16 +131,16 @@ define <4 x i32> @test7(<4 x i32> %x) {
; X86-LABEL: test7:
; X86: # %bb.0:
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3264175145,u,1749801491,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test7:
; X64: # %bb.0:
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
ret <4 x i32> %div
@@ -156,7 +156,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,u,2863311531,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -164,7 +164,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X64-LABEL: test8:
; X64: # %bb.0:
; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,2863311531,2863311531]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
ret <4 x i32> %div
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index e53eed4..504a392 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1760,7 +1760,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,65535,u,u]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1781,7 +1781,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,65535,u,u]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1864,7 +1864,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65536,65536,65536,65536]
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
@@ -1876,7 +1876,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,65536,u,u]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1885,7 +1885,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65536,65536,65536,65536]
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
@@ -1895,7 +1895,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,65536,u,u]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1922,7 +1922,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,32768,32768,32768]
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
@@ -1934,7 +1934,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,32768,u,u]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1943,7 +1943,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,32768,32768,32768]
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
@@ -1953,7 +1953,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,32768,u,u]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 975ffd0..e8c05f9 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -336,13 +336,13 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,18778,18778,18778]
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,18778,18778,18778]
; SSE4-64-NEXT: retq
;
; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
@@ -838,13 +838,13 @@ define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; SSE-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; SSE-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,18778,18778,18778]
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-64: # %bb.0:
; SSE-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,18778,18778,18778]
; SSE-64-NEXT: retq
;
; AVX2-LABEL: test_mul_v4i32_v4i16_minsize:
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 2d07788..bb7245c 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -10,15 +10,15 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -30,10 +30,10 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -47,10 +47,10 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -63,7 +63,7 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -75,7 +75,7 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -109,7 +109,7 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -119,7 +119,7 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -168,7 +168,7 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -178,7 +178,7 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -234,7 +234,7 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -248,7 +248,7 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -308,7 +308,7 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -322,7 +322,7 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -367,15 +367,15 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -387,10 +387,10 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -404,10 +404,10 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -420,7 +420,7 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -432,7 +432,7 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -448,15 +448,15 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -468,10 +468,10 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -485,10 +485,10 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -501,7 +501,7 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -514,7 +514,7 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -534,14 +534,14 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -553,7 +553,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
@@ -568,9 +568,9 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -581,7 +581,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -593,7 +593,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -611,9 +611,9 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -636,11 +636,11 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -654,11 +654,11 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -671,7 +671,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -683,7 +683,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -701,9 +701,9 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -711,7 +711,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -725,11 +725,11 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -743,11 +743,11 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -760,7 +760,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -772,7 +772,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -806,7 +806,7 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -816,7 +816,7 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -871,7 +871,7 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -885,7 +885,7 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -929,15 +929,15 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -949,10 +949,10 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -966,10 +966,10 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -982,7 +982,7 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -994,7 +994,7 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1018,9 +1018,9 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
@@ -1039,7 +1039,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
; CHECK-SSE41-NEXT: pand %xmm0, %xmm2
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,1,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -1053,7 +1053,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
@@ -1067,7 +1067,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
; CHECK-AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
@@ -1080,7 +1080,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
@@ -1102,7 +1102,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1137,8 +1137,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [2147483648,2147483648,2,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1156,11 +1156,11 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [2147483648,2147483648,2,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1177,7 +1177,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
@@ -1196,7 +1196,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
@@ -1219,7 +1219,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1227,7 +1227,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -1253,8 +1253,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,2147483648,2,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1272,11 +1272,11 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,2147483648,2,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1293,7 +1293,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
@@ -1312,7 +1312,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
@@ -1333,14 +1333,14 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1352,7 +1352,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
@@ -1367,9 +1367,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1380,7 +1380,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1392,7 +1392,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1410,9 +1410,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3067833783,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1420,7 +1420,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1434,11 +1434,11 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1452,11 +1452,11 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1469,7 +1469,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1481,7 +1481,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1499,9 +1499,9 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1509,7 +1509,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1523,11 +1523,11 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1541,11 +1541,11 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1558,7 +1558,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1570,7 +1570,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1604,7 +1604,7 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -1614,7 +1614,7 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1669,7 +1669,7 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -1683,7 +1683,7 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -1727,15 +1727,15 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1747,10 +1747,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1764,10 +1764,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1780,7 +1780,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1792,7 +1792,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1812,15 +1812,15 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,268435456,1,1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1832,10 +1832,10 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1,1]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1849,10 +1849,10 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1,1]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1865,7 +1865,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1877,7 +1877,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1895,9 +1895,9 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3067833783,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1905,7 +1905,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [268435456,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1919,11 +1919,11 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1937,11 +1937,11 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1954,7 +1954,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1966,7 +1966,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1984,15 +1984,15 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2004,10 +2004,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -2021,10 +2021,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -2037,7 +2037,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2049,7 +2049,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -2067,9 +2067,9 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrlq $32, %xmm1
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2081,9 +2081,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
@@ -2096,9 +2096,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,u,1,u]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
@@ -2110,7 +2110,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,0]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2122,7 +2122,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,0]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -2138,9 +2138,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrlq $32, %xmm1
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2152,9 +2152,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
@@ -2167,9 +2167,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,u,1,u]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,u,268435456,u]
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
@@ -2181,7 +2181,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,0]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2193,7 +2193,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,0]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -2335,10 +2335,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [34048,34048,26368,37632,21760,33024,22016,35072]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [20224,26368,6912,30976,33024,33024,33024,12032]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0]
@@ -2369,10 +2369,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm4
; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [2304,0,10496,37632,33024,33024,21760,36096]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [22016,24320,37632,11008,12544,32512,16640,37632]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
@@ -2417,10 +2417,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [34048,34048,26368,37632,21760,33024,22016,35072,2304,0,10496,37632,33024,33024,21760,36096]
+; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137,0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [20224,26368,6912,30976,33024,33024,33024,12032,22016,24320,37632,11008,12544,32512,16640,37632]
+; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47,0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index 3359202..d459d01 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -24,7 +24,7 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -34,7 +34,7 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_25:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -90,7 +90,7 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
@@ -104,7 +104,7 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
@@ -165,7 +165,7 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_neg25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -175,7 +175,7 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_neg25:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -231,7 +231,7 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_neg100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
@@ -245,7 +245,7 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_neg100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
@@ -333,7 +333,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: psrld $31, %xmm1
; CHECK-SSE41-NEXT: psrad $3, %xmm2
; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [25,25,25,25]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -351,7 +351,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25,25,25,25]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -444,7 +444,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: psrld $31, %xmm1
; CHECK-SSE41-NEXT: psrad $5, %xmm2
; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [100,100,100,100]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -462,7 +462,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [100,100,100,100]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
index 271d11e..2b3f26a 100644
--- a/llvm/test/CodeGen/X86/udiv-exact.ll
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -87,7 +87,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2863311531,u,3264175145,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -95,7 +95,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3264175145,3264175145]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
ret <4 x i32> %div
@@ -112,7 +112,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm0, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2863311531,u,3303820997,u]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqa %xmm1, %xmm0
@@ -121,7 +121,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X64-LABEL: test6:
; X64: # %bb.0:
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3303820997,3303820997]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
ret <4 x i32> %div
@@ -131,16 +131,16 @@ define <4 x i32> @test7(<4 x i32> %x) {
; X86-LABEL: test7:
; X86: # %bb.0:
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3264175145,u,1749801491,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test7:
; X64: # %bb.0:
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
ret <4 x i32> %div
@@ -156,7 +156,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,u,2863311531,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -164,7 +164,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X64-LABEL: test8:
; X64: # %bb.0:
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,2863311531,2863311531]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
ret <4 x i32> %div
diff --git a/llvm/test/CodeGen/X86/undo-mul-and.ll b/llvm/test/CodeGen/X86/undo-mul-and.ll
index c9c40099..6566153 100644
--- a/llvm/test/CodeGen/X86/undo-mul-and.ll
+++ b/llvm/test/CodeGen/X86/undo-mul-and.ll
@@ -63,9 +63,9 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_fail_no_splat(<4 x i32> %x) {
; CHECK-SSE-LABEL: mul_and_to_neg_shl_and_vec_fail_no_splat:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [56,56,56,64]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,u,64,u]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -73,13 +73,13 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_fail_no_splat(<4 x i32> %x) {
;
; CHECK-AVX1-LABEL: mul_and_to_neg_shl_and_vec_fail_no_splat:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,64]
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX512-LABEL: mul_and_to_neg_shl_and_vec_fail_no_splat:
; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,64]
; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-AVX512-NEXT: retq
%mul = mul <4 x i32> %x, <i32 56, i32 56, i32 56, i32 64>
@@ -92,9 +92,9 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_todo_no_splat1(<4 x i32> %x) {
; CHECK-SSE-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat1:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [56,56,56,48]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,u,48,u]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -102,13 +102,13 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_todo_no_splat1(<4 x i32> %x) {
;
; CHECK-AVX1-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat1:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,48]
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX512-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat1:
; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,48]
; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-AVX512-NEXT: retq
%mul = mul <4 x i32> %x, <i32 56, i32 56, i32 56, i32 48>
@@ -131,7 +131,7 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_todo_no_splat2(<4 x i32> %x) {
;
; CHECK-AVX1-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat2:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,56]
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 874d885..759055d 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -167,7 +167,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE41-NEXT: pinsrd $1, %esi, %xmm0
; SSE41-NEXT: pinsrd $2, %edx, %xmm0
; SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [683,1463,819,u]
; SSE41-NEXT: pmovsxwd {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
@@ -193,7 +193,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [683,1463,819,u]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2047,2047,2047,2047]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
@@ -218,7 +218,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX2-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [683,1463,819,u]
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -240,7 +240,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX512VL-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX512VL-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [683,1463,819,u]
; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 838086e..2228c09 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -10,10 +10,10 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -26,9 +26,9 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -42,9 +42,9 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -57,7 +57,7 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -68,7 +68,7 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -87,9 +87,9 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -99,7 +99,7 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -108,7 +108,7 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_allones_eq:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -122,9 +122,9 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -134,7 +134,7 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,858993460,2,858993460]
; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -143,7 +143,7 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_allones_ne:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-AVX-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -159,12 +159,12 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -178,10 +178,10 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -195,10 +195,10 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -211,7 +211,7 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -222,7 +222,7 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -237,12 +237,12 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -256,10 +256,10 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -273,10 +273,10 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -289,7 +289,7 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -317,10 +317,10 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -333,9 +333,9 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -349,9 +349,9 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -364,7 +364,7 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -375,7 +375,7 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -390,10 +390,10 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -406,9 +406,9 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -422,9 +422,9 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -437,7 +437,7 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -448,7 +448,7 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -467,10 +467,10 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -482,7 +482,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -496,8 +496,8 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -508,7 +508,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -519,7 +519,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -536,12 +536,12 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -555,10 +555,10 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -572,10 +572,10 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -588,7 +588,7 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -599,7 +599,7 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -616,11 +616,11 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -634,10 +634,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -651,10 +651,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -667,7 +667,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -678,7 +678,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -710,7 +710,7 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,4294967295,858993459]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -719,7 +719,7 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -769,7 +769,7 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
; CHECK-SSE41-NEXT: pslld $31, %xmm0
@@ -782,7 +782,7 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -822,10 +822,10 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -838,9 +838,9 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -854,9 +854,9 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -869,7 +869,7 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -880,7 +880,7 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -899,10 +899,10 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -914,7 +914,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -928,8 +928,8 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,2,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -940,7 +940,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -951,7 +951,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -968,12 +968,12 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,2,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -987,10 +987,10 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,2,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1004,10 +1004,10 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,2,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1020,7 +1020,7 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1031,7 +1031,7 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1048,11 +1048,11 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,2,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1066,10 +1066,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,2,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1083,10 +1083,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,2,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1099,7 +1099,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1110,7 +1110,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1129,10 +1129,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1144,7 +1144,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3435973837]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -1158,8 +1158,8 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3435973837]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1170,7 +1170,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1181,7 +1181,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1198,11 +1198,11 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1216,10 +1216,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1233,10 +1233,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1249,7 +1249,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1260,7 +1260,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1277,11 +1277,11 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1295,10 +1295,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1312,10 +1312,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1328,7 +1328,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1339,7 +1339,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1358,9 +1358,9 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1370,7 +1370,7 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,858993459]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -1379,7 +1379,7 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_allones_and_one:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3435973837]
; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -1395,11 +1395,11 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,0,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1413,10 +1413,10 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1430,10 +1430,10 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1446,7 +1446,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1457,7 +1457,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1474,10 +1474,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1490,9 +1490,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1506,9 +1506,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1521,7 +1521,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1532,7 +1532,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1551,10 +1551,10 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3435973837,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1,1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1567,9 +1567,9 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1,1]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1583,9 +1583,9 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1,1]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1598,7 +1598,7 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1609,7 +1609,7 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1626,11 +1626,11 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1644,10 +1644,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1661,10 +1661,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1677,7 +1677,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1688,7 +1688,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1705,10 +1705,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1721,9 +1721,9 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1737,9 +1737,9 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1752,7 +1752,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1763,7 +1763,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1781,10 +1781,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,0,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,0]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1796,7 +1796,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,0]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -1810,8 +1810,8 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,0]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1822,7 +1822,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,0]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1833,7 +1833,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,0]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1849,10 +1849,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,0,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,0]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1864,7 +1864,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,0]
; CHECK-SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -1878,8 +1878,8 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,0]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2147483648,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1890,7 +1890,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,0]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1901,7 +1901,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,0]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
index 6a36cd2..8042103 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -25,7 +25,7 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_3:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655764,1431655764,1431655764]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -34,7 +34,7 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_3:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
@@ -80,7 +80,7 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_5:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -89,7 +89,7 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_5:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
@@ -140,7 +140,7 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_6_part0:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
; CHECK-SSE41-NEXT: pslld $31, %xmm0
@@ -153,7 +153,7 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_6_part0:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -211,7 +211,7 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_6_part1:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
; CHECK-SSE41-NEXT: pslld $31, %xmm0
@@ -224,7 +224,7 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_6_part1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_tautological:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -286,7 +286,7 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_tautological:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 2166e43..b490c3c 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -23,7 +23,7 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -32,7 +32,7 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_25:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -83,7 +83,7 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
; CHECK-SSE41-NEXT: pslld $30, %xmm0
@@ -96,7 +96,7 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -139,9 +139,9 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_neg25:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,1030792151,1030792151,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1030792151,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -151,7 +151,7 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_neg25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,1030792151,1030792151,3264175145]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,1,1,171798691]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -160,7 +160,7 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_neg25:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,1030792151,1030792151,3264175145]
; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -176,9 +176,9 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_neg100:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
@@ -192,7 +192,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_neg100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
; CHECK-SSE41-NEXT: pslld $30, %xmm0
@@ -205,7 +205,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_neg100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -216,7 +216,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_neg100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -227,7 +227,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_neg100:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -277,7 +277,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrld $3, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [25,25,25,25]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -293,7 +293,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25,25,25,25]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -372,7 +372,7 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [100,100,100,100]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -388,7 +388,7 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [100,100,100,100]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
index 84856aa..e5b19a5 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -25,7 +25,7 @@ define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind {
define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t1_all_odd_eq:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -33,7 +33,7 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: t1_all_odd_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -43,7 +43,7 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: t1_all_odd_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -76,7 +76,7 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t1_all_odd_ne:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -84,7 +84,7 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: t1_all_odd_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -95,7 +95,7 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: t1_all_odd_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -187,7 +187,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
; CHECK-SSE2-NEXT: psrlq $32, %xmm3
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311530,2863311530]
; CHECK-SSE2-NEXT: paddq %xmm3, %xmm0
; CHECK-SSE2-NEXT: psllq $32, %xmm0
; CHECK-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -212,7 +212,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3
; CHECK-SSE41-NEXT: psrlq $32, %xmm3
; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311530,2863311530]
; CHECK-SSE41-NEXT: paddq %xmm3, %xmm0
; CHECK-SSE41-NEXT: psllq $32, %xmm0
; CHECK-SSE41-NEXT: paddq %xmm2, %xmm0
@@ -236,7 +236,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311530,2863311530]
; CHECK-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -255,7 +255,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311530,2863311530]
; CHECK-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 83a0ddb..fce8795 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -241,7 +241,7 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -319,7 +319,7 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; SSE41-NEXT: pmaxud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
@@ -1261,7 +1261,7 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
;
; SSE41-LABEL: var_shuffle_v4f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1339,7 +1339,7 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; SSE41-NEXT: pmaxud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_reassociate.ll b/llvm/test/CodeGen/X86/vec_reassociate.ll
index a9473fff..4703ca3 100644
--- a/llvm/test/CodeGen/X86/vec_reassociate.ll
+++ b/llvm/test/CodeGen/X86/vec_reassociate.ll
@@ -38,13 +38,13 @@ define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: mul_4i32:
; X86: # %bb.0:
; X86-NEXT: pmulld %xmm1, %xmm0
-; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,6,6,4]
; X86-NEXT: retl
;
; X64-LABEL: mul_4i32:
; X64: # %bb.0:
; X64-NEXT: pmulld %xmm1, %xmm0
-; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,6,6,4]
; X64-NEXT: retq
%1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
%2 = mul <4 x i32> %a1, <i32 4, i32 3, i32 2, i32 1>
@@ -56,13 +56,13 @@ define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: mul_4i32_commute:
; X86: # %bb.0:
; X86-NEXT: pmulld %xmm1, %xmm0
-; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,6,6,4]
; X86-NEXT: retl
;
; X64-LABEL: mul_4i32_commute:
; X64: # %bb.0:
; X64-NEXT: pmulld %xmm1, %xmm0
-; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,6,6,4]
; X64-NEXT: retq
%1 = mul <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %a0
%2 = mul <4 x i32> <i32 4, i32 3, i32 2, i32 1>, %a1
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index ac932d5..1a63515 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1090,7 +1090,6 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1335,7 +1334,6 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8>
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
@@ -4733,7 +4731,6 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
@@ -4751,72 +4748,7 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind {
; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: vpextrb $4, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $5, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rax)
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addl %ecx, %eax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, -24(%rsp,%rcx)
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rax)
+; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rcx)
; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 762900e..a0c2760 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1821,9 +1821,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1841,7 +1841,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE41-NEXT: psrld $28, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -1854,7 +1854,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -1935,9 +1935,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 445e572..2fadf5f 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1647,7 +1647,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [256,512,1024,2048]
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3
; AVX1-NEXT: vpsrld $27, %xmm1, %xmm4
@@ -1656,7 +1656,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index d0690bd..ec2efcd 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1302,9 +1302,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1316,8 +1316,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1328,8 +1328,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1394,9 +1394,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 421fa98..5f7e407 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -1082,13 +1082,13 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,u,2048,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,512,1024,2048]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index b378dce..304daab 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -319,9 +319,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -333,8 +333,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -345,8 +345,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -411,9 +411,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index 06ff7e7..ae5dd18 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -500,9 +500,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-NEXT: psrld $27, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -514,7 +514,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE41-NEXT: psrld $27, %xmm2
; SSE41-NEXT: psrld $28, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -523,7 +523,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -598,9 +598,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; X86-SSE2-NEXT: psrld $27, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index d16b28a..33a6a76 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1741,9 +1741,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [134217728,u,33554432,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1761,7 +1761,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE41-NEXT: psrld $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -1774,7 +1774,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -1856,9 +1856,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,67108864,33554432]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [134217728,u,33554432,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index a387562..217431be 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1403,7 +1403,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [16777216,8388608,4194304,2097152]
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrld $7, %xmm1, %xmm3
; AVX1-NEXT: vpsrld $5, %xmm1, %xmm4
@@ -1412,7 +1412,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 4969cb5..5d01dfd 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1380,9 +1380,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,33554432,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1394,8 +1394,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,33554432,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1406,8 +1406,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,33554432,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1472,9 +1472,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,67108864,33554432]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,33554432,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index e2a3e26..4dc931d 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -1134,13 +1134,13 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,33554432,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [8388608,u,2097152,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [16777216,8388608,4194304,2097152]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index ef5ffe4..4b42b18 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -341,9 +341,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -355,8 +355,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -367,8 +367,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -433,9 +433,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 816d5ca..e68d1d7 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -171,7 +171,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm4
@@ -193,7 +193,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -260,11 +260,11 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -287,10 +287,10 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -561,7 +561,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm4
@@ -588,7 +588,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -667,11 +667,11 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm1, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
@@ -706,11 +706,11 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
@@ -741,10 +741,10 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
@@ -1116,11 +1116,11 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [26368,47872,11008,20224,37632,35072,33024,30976]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,103,0,187,0,43,0,79,0,147,0,137,0,129,0,121]
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [33024,22016,33024,26368,11008,37632,33024,14592]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,129,0,86,0,129,0,103,0,43,0,147,0,129,0,57]
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1144,10 +1144,10 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [26368,47872,11008,20224,37632,35072,33024,30976]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,103,0,187,0,43,0,79,0,147,0,137,0,129,0,121]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [33024,22016,33024,26368,11008,37632,33024,14592]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,129,0,86,0,129,0,103,0,43,0,147,0,129,0,57]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 63c69e5..7355f36 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -161,7 +161,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -198,7 +198,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -245,10 +245,10 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
@@ -266,10 +266,10 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [35072,33024,30976,14592,6912,26368,12544,47872]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
@@ -291,10 +291,10 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [35072,33024,30976,14592,6912,26368,12544,47872,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -539,7 +539,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -585,7 +585,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -640,10 +640,10 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
@@ -668,10 +668,10 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [35072,33024,30976,14592,6912,26368,12544,47872]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
@@ -699,10 +699,10 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [35072,33024,30976,14592,6912,26368,12544,47872,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 6bc4fcb..5445330 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -132,7 +132,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
@@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -199,10 +199,10 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
@@ -220,10 +220,10 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm4
; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
@@ -245,10 +245,10 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912,35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27,0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072,6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137,0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
@@ -444,7 +444,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
@@ -490,7 +490,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -524,10 +524,10 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4
@@ -552,10 +552,10 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpaddb %ymm4, %ymm3, %ymm3
@@ -583,10 +583,10 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912,35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27,0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072,6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137,0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 33d80f6..6cd5098 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -169,7 +169,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -209,7 +209,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -270,22 +270,22 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: psubb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm3, %xmm0
@@ -309,7 +309,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -317,15 +317,15 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: psubb %xmm4, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: paddb %xmm4, %xmm2
@@ -346,22 +346,22 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
@@ -638,7 +638,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -690,7 +690,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -763,23 +763,23 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: paddb %xmm3, %xmm2
@@ -809,7 +809,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -817,16 +817,16 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubb %xmm4, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
@@ -854,22 +854,22 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index e43108f..98ea87c 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -166,7 +166,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -200,7 +200,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -246,22 +246,22 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -276,22 +276,22 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [137,16,241,57,27,205,135,187]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
@@ -312,20 +312,20 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -578,7 +578,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -622,7 +622,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -676,22 +676,22 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
@@ -713,22 +713,22 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [137,16,241,57,27,205,135,187]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,0,0,0,0,0,128,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
@@ -755,20 +755,20 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm3
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm3, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index bf98bcc..a11fa370 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -135,7 +135,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -199,20 +199,20 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
@@ -226,20 +226,20 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
@@ -259,20 +259,20 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0,137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0,27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
@@ -473,7 +473,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
@@ -517,7 +517,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -551,20 +551,20 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
@@ -585,20 +585,20 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15],ymm5[24],ymm1[24],ymm5[25],ymm1[25],ymm5[26],ymm1[26],ymm5[27],ymm1[27],ymm5[28],ymm1[28],ymm5[29],ymm1[29],ymm5[30],ymm1[30],ymm5[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[16],ymm1[16],ymm5[17],ymm1[17],ymm5[18],ymm1[18],ymm5[19],ymm1[19],ymm5[20],ymm1[20],ymm5[21],ymm1[21],ymm5[22],ymm1[22],ymm5[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm4, %ymm5, %ymm4
@@ -624,20 +624,20 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0,137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0,27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8],zmm1[8],zmm3[9],zmm1[9],zmm3[10],zmm1[10],zmm3[11],zmm1[11],zmm3[12],zmm1[12],zmm3[13],zmm1[13],zmm3[14],zmm1[14],zmm3[15],zmm1[15],zmm3[24],zmm1[24],zmm3[25],zmm1[25],zmm3[26],zmm1[26],zmm3[27],zmm1[27],zmm3[28],zmm1[28],zmm3[29],zmm1[29],zmm3[30],zmm1[30],zmm3[31],zmm1[31],zmm3[40],zmm1[40],zmm3[41],zmm1[41],zmm3[42],zmm1[42],zmm3[43],zmm1[43],zmm3[44],zmm1[44],zmm3[45],zmm1[45],zmm3[46],zmm1[46],zmm3[47],zmm1[47],zmm3[56],zmm1[56],zmm3[57],zmm1[57],zmm3[58],zmm1[58],zmm3[59],zmm1[59],zmm3[60],zmm1[60],zmm3[61],zmm1[61],zmm3[62],zmm1[62],zmm3[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm1[0],zmm3[1],zmm1[1],zmm3[2],zmm1[2],zmm3[3],zmm1[3],zmm3[4],zmm1[4],zmm3[5],zmm1[5],zmm3[6],zmm1[6],zmm3[7],zmm1[7],zmm3[16],zmm1[16],zmm3[17],zmm1[17],zmm3[18],zmm1[18],zmm3[19],zmm1[19],zmm3[20],zmm1[20],zmm3[21],zmm1[21],zmm3[22],zmm1[22],zmm3[23],zmm1[23],zmm3[32],zmm1[32],zmm3[33],zmm1[33],zmm3[34],zmm1[34],zmm3[35],zmm1[35],zmm3[36],zmm1[36],zmm3[37],zmm1[37],zmm3[38],zmm1[38],zmm3[39],zmm1[39],zmm3[48],zmm1[48],zmm3[49],zmm1[49],zmm3[50],zmm1[50],zmm3[51],zmm1[51],zmm3[52],zmm1[52],zmm3[53],zmm1[53],zmm3[54],zmm1[54],zmm3[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm2, %zmm3, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 6e1bf25..d0bb90c 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -130,31 +130,31 @@ define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,u,8,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8]
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,u,8,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8]
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8:
@@ -190,12 +190,12 @@ define <4 x i32> @mul_v4i32_1_2_4_8_optsize(<4 x i32> %a0) nounwind optsize {
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8]
; X86-SSE4-NEXT: retl
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8]
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8_optsize:
@@ -989,7 +989,7 @@ define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_17_65:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [17,65]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 17, i64 65>
ret <2 x i64> %1
@@ -999,36 +999,36 @@ define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [5,17,33,65]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [17,u,65,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [5,17,33,65]
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [5,17,33,65]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [17,u,65,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [5,17,33,65]
; X64-SSE4-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_5_17_33_65:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [5,17,33,65]
; X64-AVX-NEXT: retq
%1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65>
ret <4 x i32> %1
@@ -1384,7 +1384,7 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_63:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [15,63]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 15, i64 63>
ret <2 x i64> %1
@@ -1427,7 +1427,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1441,7 +1441,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1453,7 +1453,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1465,7 +1465,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1473,7 +1473,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18446744073709551601,18446744073709551553]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 -15, i64 -63>
ret <2 x i64> %1
@@ -1516,7 +1516,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1530,7 +1530,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1542,7 +1542,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1554,7 +1554,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1562,7 +1562,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18446744073709551599,18446744073709551551]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 -17, i64 -65>
ret <2 x i64> %1
@@ -1600,7 +1600,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psrlq $32, %xmm3
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE2-NEXT: paddq %xmm3, %xmm0
; X86-SSE2-NEXT: psllq $32, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1614,7 +1614,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
; X86-SSE4-NEXT: psrlq $32, %xmm3
; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE4-NEXT: paddq %xmm3, %xmm0
; X86-SSE4-NEXT: psllq $32, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1628,7 +1628,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1642,7 +1642,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1654,7 +1654,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1666,7 +1666,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1674,7 +1674,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 0, i64 -1>
ret <2 x i64> %1
@@ -1689,7 +1689,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psrlq $32, %xmm3
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE2-NEXT: paddq %xmm3, %xmm0
; X86-SSE2-NEXT: psllq $32, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1703,7 +1703,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
; X86-SSE4-NEXT: psrlq $32, %xmm3
; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE4-NEXT: paddq %xmm3, %xmm0
; X86-SSE4-NEXT: psllq $32, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1717,7 +1717,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1731,7 +1731,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1743,7 +1743,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1755,7 +1755,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1763,7 +1763,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [15,18446744073709551553]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 15, i64 -63>
ret <2 x i64> %1
@@ -1773,36 +1773,36 @@ define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,15,31,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [15,u,7,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,15,31,7]
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,15,31,7]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [15,u,7,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,15,31,7]
; X64-SSE4-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_0_15_31_7:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,15,31,7]
; X64-AVX-NEXT: retq
%1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7>
ret <4 x i32> %1
@@ -1947,7 +1947,7 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_68_132:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [68,132]
; X64-AVX512DQ-NEXT: retq
%mul = mul <2 x i64> %x, <i64 68, i64 132>
ret <2 x i64> %mul
@@ -2009,7 +2009,7 @@ define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_60_120:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [60,124]
; X64-AVX512DQ-NEXT: retq
%mul = mul <2 x i64> %x, <i64 60, i64 124>
ret <2 x i64> %mul
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 93f4ce7..0bf5a8d 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1092,9 +1092,9 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_rotate_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1106,8 +1106,8 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE41-LABEL: constant_rotate_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1118,8 +1118,8 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1156,9 +1156,9 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; X86-SSE2-LABEL: constant_rotate_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 64c3118..5ae3e2f 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -895,13 +895,13 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,u,2048,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,512,1024,2048]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 99dac74..3085c32 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -987,21 +987,21 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
@@ -1032,9 +1032,9 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v4i32:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index b56a8b5..f9ccd1e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1117,9 +1117,9 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [16,32,64,128]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,512,256,128]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1153,9 +1153,9 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
;
; X86-AVX1-LABEL: constant_shift_v8i32:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [16,32,64,128]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [256,512,256,128]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 0e20b18..18d79b6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -778,9 +778,9 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
; SSSE3-LABEL: combine_shl_pshufb:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,256,65536,65536]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,u,65536,u]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
@@ -788,13 +788,13 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
;
; SSE41-LABEL: combine_shl_pshufb:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,256,65536,65536]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_shl_pshufb:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,256,65536,65536]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 1af7542..4235377 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -2110,7 +2110,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
@@ -2119,7 +2119,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -2127,7 +2127,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -2135,7 +2135,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
@@ -2143,7 +2143,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -2151,7 +2151,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
@@ -2253,13 +2253,13 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,3]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [4,5]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [6,7]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [8,9]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [10,11]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [12,13]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [14,15]
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -2280,18 +2280,18 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 # [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5 # [4,5]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [6,7]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6 # [8,9]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [10,11]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 # [12,13]
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [14,15]
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm8 = [255,255]
; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
@@ -2313,10 +2313,10 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [4,5,6,7]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [8,9,10,11]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [12,13,14,15]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
@@ -2335,8 +2335,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [8,9,10,11,12,13,14,15]
; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2345,8 +2345,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2355,8 +2355,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2371,27 +2371,27 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,u,3,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [5,u,7,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [8,9,10,11]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [9,u,11,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [12,13,14,15]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [13,u,15,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
@@ -2406,12 +2406,12 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,1,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,5,6,7]
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [8,9,10,11]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [12,13,14,15]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
@@ -2425,8 +2425,8 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -2439,7 +2439,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 17315c4..ac330a7 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -95,7 +95,7 @@ bb:
define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
; AVX1-LABEL: test3:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## [2863311531,2863311531,2863311531,2863311531]
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
@@ -151,23 +151,19 @@ define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR22706:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index 8543e9f..16700d4 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -1046,7 +1046,7 @@ define <2 x i64> @blend_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
define <4 x i32> @blend_mask_cond_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; AVX1-LABEL: blend_mask_cond_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,4194304,1073741824,2147483648]
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -1211,9 +1211,9 @@ define <4 x i64> @blend_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z
define <8 x i32> @blend_mask_cond_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: blend_mask_cond_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [2147483648,1073741824,268435456,536870912]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,2097152,1073741824,524288]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll b/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll
new file mode 100644
index 0000000..d9253e0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl,+avx512bw -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512BW
+
+define void @zero_xmm(<4 x i32> %arg) #0 {
+; SSE-LABEL: zero_xmm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_xmm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_xmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ store <4 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_ymm(<8 x i32> %arg) #0 {
+; SSE-LABEL: zero_ymm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm1, 16
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_ymm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %ymm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %ymm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ store <8 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_zmm(<16 x i32> %arg) #0 {
+; SSE-LABEL: zero_zmm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm3, 48
+; SSE-NEXT: movaps %xmm2, 32
+; SSE-NEXT: movaps %xmm1, 16
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_zmm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %ymm1, 32
+; AVX-NEXT: vmovaps %ymm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_zmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups %zmm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ store <16 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_k(<8 x i32> %arg, <8 x i1> %mask) #0 {
+; SSE-LABEL: zero_k:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $15, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm2
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: testb $1, %al
+; SSE-NEXT: jne .LBB3_1
+; SSE-NEXT: # %bb.2: # %else
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: jne .LBB3_3
+; SSE-NEXT: .LBB3_4: # %else2
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: jne .LBB3_5
+; SSE-NEXT: .LBB3_6: # %else4
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: jne .LBB3_7
+; SSE-NEXT: .LBB3_8: # %else6
+; SSE-NEXT: testb $16, %al
+; SSE-NEXT: jne .LBB3_9
+; SSE-NEXT: .LBB3_10: # %else8
+; SSE-NEXT: testb $32, %al
+; SSE-NEXT: jne .LBB3_11
+; SSE-NEXT: .LBB3_12: # %else10
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: jne .LBB3_13
+; SSE-NEXT: .LBB3_14: # %else12
+; SSE-NEXT: testb $-128, %al
+; SSE-NEXT: je .LBB3_16
+; SSE-NEXT: .LBB3_15: # %cond.store13
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; SSE-NEXT: movd %xmm0, 28
+; SSE-NEXT: .LBB3_16: # %else14
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB3_1: # %cond.store
+; SSE-NEXT: movd %xmm0, 0
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: je .LBB3_4
+; SSE-NEXT: .LBB3_3: # %cond.store1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: movd %xmm2, 4
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: je .LBB3_6
+; SSE-NEXT: .LBB3_5: # %cond.store3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE-NEXT: movd %xmm2, 8
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: je .LBB3_8
+; SSE-NEXT: .LBB3_7: # %cond.store5
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-NEXT: movd %xmm0, 12
+; SSE-NEXT: testb $16, %al
+; SSE-NEXT: je .LBB3_10
+; SSE-NEXT: .LBB3_9: # %cond.store7
+; SSE-NEXT: movd %xmm1, 16
+; SSE-NEXT: testb $32, %al
+; SSE-NEXT: je .LBB3_12
+; SSE-NEXT: .LBB3_11: # %cond.store9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movd %xmm0, 20
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: je .LBB3_14
+; SSE-NEXT: .LBB3_13: # %cond.store11
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: movd %xmm0, 24
+; SSE-NEXT: testb $-128, %al
+; SSE-NEXT: jne .LBB3_15
+; SSE-NEXT: jmp .LBB3_16
+;
+; AVX1-LABEL: zero_k:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 0
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zero_k:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, 0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: zero_k:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512VL-NEXT: vmovdqa32 %ymm0, 0 {%k1}
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: kxorw %k0, %k0, %k1
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: zero_k:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovw2m %xmm1, %k1
+; AVX512BW-NEXT: vmovdqa32 %ymm0, 0 {%k1}
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: kxorq %k0, %k0, %k1
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %arg, ptr null, i32 32, <8 x i1> %mask)
+ ret void
+}
+
+attributes #0 = { "zero-call-used-regs"="used" }
diff --git a/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll b/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll
index e2b6167..c8cc871 100644
--- a/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll
+++ b/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll
@@ -1,6 +1,10 @@
; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-info - | FileCheck %s --implicit-check-not "DW_AT_language"
-; CHECK: DW_AT_language_name (DW_LNAME_ObjC_plus_plus)
+; CHECK: DW_AT_language_name (DW_LNAME_ObjC_plus_plus)
+; CHECK: DW_AT_language_name (DW_LNAME_C_plus_plus)
+; CHECK: DW_AT_language_version (201100)
+; CHECK: DW_AT_language_name (DW_LNAME_Rust)
+; CHECK-NOT: DW_AT_language_version
@x = global i32 0, align 4, !dbg !0
@@ -9,7 +13,7 @@ define void @_Z4funcv() !dbg !8 {
ret void, !dbg !11
}
-!llvm.dbg.cu = !{!2}
+!llvm.dbg.cu = !{!2, !12, !13}
!llvm.module.flags = !{!6, !7}
!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
@@ -24,3 +28,5 @@ define void @_Z4funcv() !dbg !8 {
!9 = !DISubroutineType(types: !10)
!10 = !{null}
!11 = !DILocation(line: 2, column: 14, scope: !8)
+!12 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201100, file: !3, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
+!13 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_Rust, sourceLanguageVersion: 0, file: !3, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
diff --git a/llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir b/llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir
new file mode 100644
index 0000000..b97e916
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir
@@ -0,0 +1,99 @@
+# RUN: %llc_dwarf %s -o - -mtriple=x86_64-unknown-unknown --start-after=livedebugvalues | FileCheck %s
+
+## Check the line number from the ret above `.LBB0_2` doesn't leak onto the
+## frame setup instructions in the `.LBB0_2` block; `pushq %rax` should
+## explicitly get set to line zero.
+
+# CHECK: loop:
+# CHECK-NEXT: .Lfunc_begin0:
+# CHECK-NEXT: .cfi_startproc
+# CHECK-NEXT: # %bb.0:
+# CHECK-NEXT: .file 1 "/" "test.c"
+# CHECK-NEXT: .loc 1 5 16 prologue_end # test.c:5:16
+# CHECK-NEXT: testq %rax, %rax
+# CHECK-NEXT: je .LBB0_2
+# CHECK-NEXT: # %bb.1:
+# CHECK-NEXT: .loc 1 5 16 # test.c:5:16
+# CHECK-NEXT: retq
+# CHECK-NEXT: .LBB0_2:
+# -- Check the .loc below sets the current location to line 0.
+# CHECK-NEXT: .loc 1 0 16 is_stmt 0 # test.c:0:16
+# CHECK-NEXT: pushq %rax
+# CHECK-NEXT: .cfi_def_cfa_offset 16
+# CHECK-NEXT: addq $8, %rsp
+# CHECK-NEXT: .cfi_def_cfa_offset 8
+# CHECK-NEXT: .loc 1 5 16 is_stmt 1 # test.c:5:16
+# CHECK-NEXT: retq
+
+--- |
+ source_filename = "reduced.ll"
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-unknown"
+
+ define void @loop(i64 %i) !dbg !4 {
+ entry:
+ %cmp.not = icmp eq i64 %i, 0, !dbg !7
+ br i1 %cmp.not, label %for.body, label %for.end
+
+ for.body: ; preds = %entry
+ %puts10 = tail call i32 null(ptr null)
+ %inc = add i64 0, 0
+ br label %for.end
+
+ for.end: ; preds = %for.body, %entry
+ ret void
+ }
+
+ !llvm.dbg.cu = !{!0}
+ !llvm.module.flags = !{!3}
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 22.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+ !1 = !DIFile(filename: "test.c", directory: "/")
+ !2 = !{}
+ !3 = !{i32 2, !"Debug Info Version", i32 3}
+ !4 = distinct !DISubprogram(name: "loop", scope: !1, file: !1, line: 4, type: !5, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2, keyInstructions: true)
+ !5 = !DISubroutineType(types: !6)
+ !6 = !{null}
+ !7 = !DILocation(line: 5, column: 16, scope: !8, atomGroup: 720, atomRank: 2)
+ !8 = distinct !DILexicalBlock(scope: !4, file: !1, line: 5, column: 9)
+...
+---
+name: loop
+alignment: 16
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+debugInstrRef: true
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$rdi' }
+frameInfo:
+ stackSize: 8
+ offsetAdjustment: -8
+ maxAlignment: 1
+ adjustsStack: true
+ hasCalls: true
+ maxCallFrameSize: 0
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ amxProgModel: None
+body: |
+ bb.0:
+ successors: %bb.1(0x30000000), %bb.2(0x50000000)
+ liveins: $rdi
+
+ TEST64rr undef renamable $rax, undef renamable $rax, implicit-def $eflags, debug-location !7
+ JCC_1 %bb.1, 4, implicit $eflags
+
+ bb.2:
+ RET64 debug-location !7
+
+ bb.1:
+ frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 16
+ $rsp = frame-destroy ADD64ri32 $rsp, 8, implicit-def dead $eflags
+ frame-destroy CFI_INSTRUCTION def_cfa_offset 8
+ RET64 debug-location !7
+...
diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll
new file mode 100644
index 0000000..e38da0b
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -passes=asan -S | FileCheck %s
+; CHECK: %ex = alloca i32, align 4
+; CHECK: catchpad within %{{.*}} [ptr @"??_R0H@8", i32 0, ptr %ex]
+
+; This test ensures that catch parameters are not instrumented on Windows.
+
+; This file was generated using the following source
+;
+; ```C++
+; #include <exception>
+; #include <cstdio>
+;
+; int main() {
+; try {
+; throw 1;
+; } catch (const int ex) {
+; printf("%d\n", ex);
+; return -1;
+; }
+; return 0;
+; }
+;
+; ```
+; then running the following sequence of commands
+;
+; ```
+; clang.exe -g0 -O0 -emit-llvm -c main.cpp -o main.bc
+; llvm-extract.exe -func=main main.bc -o main_func.bc
+; llvm-dis.exe main_func.bc -o main_func_dis.ll
+; ```
+; and finally manually trimming the resulting `.ll` file to remove
+; unnecessary metadata, and manually adding the `sanitize_address` annotation;
+; needed for the ASan pass to run.
+
+target triple = "x86_64-pc-windows-msvc"
+
+@"??_R0H@8" = external global ptr
+
+; Function Attrs: sanitize_address
+define i32 @main() sanitize_address personality ptr @__CxxFrameHandler3 {
+entry:
+ %ex = alloca i32, align 4
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %entry
+ %0 = catchswitch within none [label %catch] unwind to caller
+
+catch: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [ptr @"??_R0H@8", i32 0, ptr %ex]
+ call void @opaque() [ "funclet"(token %1) ]
+ catchret from %1 to label %return
+
+return: ; preds = %catch
+ ret i32 0
+
+unreachable: ; preds = %entry
+ unreachable
+}
+
+declare void @throw() noreturn
+declare void @opaque()
+declare i32 @__CxxFrameHandler3(...)
diff --git a/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll b/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
index 3ed68e8..c3a75f6 100644
--- a/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
+++ b/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
@@ -13,14 +13,14 @@
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
-; DUMP: Callsite Context Graph:
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR
-; RUN: llvm-dis %t.out.0.0.preopt.bc -o - | FileCheck %s --check-prefix=IR
; IR: !memprof {{.*}} !callsite
; IR: "memprof"="cold"
+; DUMP: Callsite Context Graph:
+
;; Next check without -supports-hot-cold-new, we should not perform
;; context disambiguation, and we should strip memprof metadata and
;; attributes before optimization.
@@ -28,13 +28,16 @@
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
+; RUN: -print-before=memprof-context-disambiguation \
; RUN: -o %t.out 2>&1 | FileCheck %s --allow-empty \
-; RUN: --implicit-check-not "Callsite Context Graph:"
+; RUN: --implicit-check-not "Callsite Context Graph:" \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
-; RUN: llvm-dis %t.out.0.0.preopt.bc -o - | FileCheck %s \
-; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
-; RUN: --implicit-check-not "memprof"="cold"
+;; Ensure the attributes and metadata are stripped when running a non-LTO pipeline.
+; RUN: opt -O3 %t.o -S | FileCheck %s \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
source_filename = "memprof-supports-hot-cold-new.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s
index 5b6bb47..83313a2 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s
@@ -5,3 +5,15 @@ ds_load_tr_b64 v[2:3], v0
ds_load_tr_b128 v[2:5], v0
// GFX1250: ds_load_tr16_b128 v[2:5], v0 ; encoding: [0x00,0x00,0xf0,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_b128_tr_b16 v[2:5], v0
+// GFX1250: ds_load_tr16_b128 v[2:5], v0 ; encoding: [0x00,0x00,0xf0,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_b64_tr_b8 v[2:3], v0
+// GFX1250: ds_load_tr8_b64 v[2:3], v0 ; encoding: [0x00,0x00,0xf4,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_b64_tr_b4 v[2:3], v0
+// GFX1250: ds_load_tr4_b64 v[2:3], v0 ; encoding: [0x00,0x00,0xe8,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_tr6_b96 v[2:4], v0
+// GFX1250: ds_load_tr6_b96 v[2:4], v0 ; encoding: [0x00,0x00,0xec,0xdb,0x00,0x00,0x00,0x02]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s
index 6b2dd67..f983bc0 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s
@@ -35,3 +35,78 @@ global_load_tr_b128 v[2:5], v[6:7], off offset:64
global_load_tr_b128 v[2:5], v[6:7], off offset:-64
// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off offset:-64 ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0xc0,0xff,0xff]
+
+global_load_b64_tr_b8 v[2:3], v0, s[0:1]
+// GFX1250: global_load_tr8_b64 v[2:3], v0, s[0:1] ; encoding: [0x00,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v0, s[0:1] offset:64
+// GFX1250: global_load_tr8_b64 v[2:3], v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr8_b64 v[2:3], v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b64_tr_b8 v[2:3], v[4:5], off
+// GFX1250: global_load_tr8_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v[4:5], off offset:64
+// GFX1250: global_load_tr8_b64 v[2:3], v[4:5], off offset:64 ; encoding: [0x7c,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v[4:5], off offset:-64
+// GFX1250: global_load_tr8_b64 v[2:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
+
+global_load_b128_tr_b16 v[2:5], v0, s[0:1]
+// GFX1250: global_load_tr16_b128 v[2:5], v0, s[0:1] ; encoding: [0x00,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v0, s[0:1] offset:64
+// GFX1250: global_load_tr16_b128 v[2:5], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr16_b128 v[2:5], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b128_tr_b16 v[2:5], v[6:7], off
+// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v[6:7], off offset:64
+// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off offset:64 ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0x40,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v[6:7], off offset:-64
+// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off offset:-64 ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0xc0,0xff,0xff]
+
+global_load_b64_tr_b4 v[2:3], v0, s[0:1]
+// GFX1250: global_load_tr4_b64 v[2:3], v0, s[0:1] ; encoding: [0x00,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v0, s[0:1] offset:64
+// GFX1250: global_load_tr4_b64 v[2:3], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr4_b64 v[2:3], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b64_tr_b4 v[2:3], v[4:5], off
+// GFX1250: global_load_tr4_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v[4:5], off offset:64
+// GFX1250: global_load_tr4_b64 v[2:3], v[4:5], off offset:64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v[4:5], off offset:-64
+// GFX1250: global_load_tr4_b64 v[2:3], v[4:5], off offset:-64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
+
+global_load_b96_tr_b6 v[2:4], v0, s[0:1]
+// GFX1250: global_load_tr6_b96 v[2:4], v0, s[0:1] ; encoding: [0x00,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b96_tr_b6 v[3:5], v0, s[0:1]
+// GFX1250: global_load_tr6_b96 v[3:5], v0, s[0:1] ; encoding: [0x00,0x00,0x1d,0xee,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v0, s[0:1] offset:64
+// GFX1250: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b96_tr_b6 v[2:4], v[6:7], off
+// GFX1250: global_load_tr6_b96 v[2:4], v[6:7], off ; encoding: [0x7c,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v[6:7], off offset:64
+// GFX1250: global_load_tr6_b96 v[2:4], v[6:7], off offset:64 ; encoding: [0x7c,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x06,0x40,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v[6:7], off offset:-64
+// GFX1250: global_load_tr6_b96 v[2:4], v[6:7], off offset:-64 ; encoding: [0x7c,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x06,0xc0,0xff,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt
new file mode 100644
index 0000000..d4888ad
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt
@@ -0,0 +1,422 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=tonga -disassemble -show-encoding < %s 2>&1 | FileCheck -strict-whitespace %s
+
+# In GFX10+, v_cmpx_* use EXEC as the implicit dst. The disassembler issues a warning when the dst
+# is not 0x7e (EXEC). In GFX9 and earlier, these instructions have explicit dst. Therefore, such
+# warnings should not be issued.
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 tba, v1, v2 ; encoding: [0x6c,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x6c,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 tma, v1, v2 ; encoding: [0x6e,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x6e,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 ttmp[10:11], v1, v2 ; encoding: [0x7a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x7a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f64_e64 s[0:1], v[1:2], v2 ; encoding: [0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00]
+0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f16_e64 s[2:3], v1, v2 ; encoding: [0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00]
+0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f16_e64 s[4:5], v1, v2 ; encoding: [0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00]
+0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f16_e64 s[6:7], v1, v2 ; encoding: [0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00]
+0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f16_e64 s[8:9], v1, v2 ; encoding: [0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00]
+0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f16_e64 s[12:13], v1, v2 ; encoding: [0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00]
+0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f16_e64 s[14:15], v1, v2 ; encoding: [0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00]
+0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f16_e64 s[16:17], v1, v2 ; encoding: [0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00]
+0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f16_e64 s[18:19], v1, v2 ; encoding: [0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00]
+0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f16_e64 s[20:21], v1, v2 ; encoding: [0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00]
+0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f16_e64 s[22:23], v1, v2 ; encoding: [0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00]
+0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f16_e64 s[24:25], v1, v2 ; encoding: [0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00]
+0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f16_e64 s[26:27], v1, v2 ; encoding: [0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00]
+0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f16_e64 s[28:29], v1, v2 ; encoding: [0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00]
+0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f16_e64 s[30:31], v1, v2 ; encoding: [0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00]
+0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f16_e64 s[32:33], v1, v2 ; encoding: [0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00]
+0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f16_e64 s[34:35], v1, v2 ; encoding: [0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00]
+0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f32_e64 s[36:37], v1, v2 ; encoding: [0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00]
+0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f32_e64 s[38:39], v1, v2 ; encoding: [0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00]
+0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f32_e64 s[40:41], v1, v2 ; encoding: [0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00]
+0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f32_e64 s[42:43], v1, v2 ; encoding: [0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00]
+0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f32_e64 s[44:45], v1, v2 ; encoding: [0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00]
+0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f32_e64 s[46:47], v1, v2 ; encoding: [0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00]
+0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f32_e64 s[48:49], v1, v2 ; encoding: [0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00]
+0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f32_e64 s[50:51], v1, v2 ; encoding: [0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00]
+0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f32_e64 s[52:53], v1, v2 ; encoding: [0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00]
+0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f32_e64 s[54:55], v1, v2 ; encoding: [0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00]
+0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f32_e64 s[56:57], v1, v2 ; encoding: [0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00]
+0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f32_e64 s[58:59], v1, v2 ; encoding: [0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00]
+0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f32_e64 s[60:61], v1, v2 ; encoding: [0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00]
+0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f32_e64 s[62:63], v1, v2 ; encoding: [0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00]
+0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f32_e64 s[64:65], v1, v2 ; encoding: [0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00]
+0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f32_e64 s[66:67], v1, v2 ; encoding: [0x42,0x00,0x5f,0xd0,0x01,0x05,0x02,0x00]
+0x42,0x00,0x5f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f64_e64 s[68:69], v[1:2], v[2:3] ; encoding: [0x44,0x00,0x70,0xd0,0x01,0x05,0x02,0x00]
+0x44,0x00,0x70,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f64_e64 s[70:71], v[1:2], v[2:3] ; encoding: [0x46,0x00,0x71,0xd0,0x01,0x05,0x02,0x00]
+0x46,0x00,0x71,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f64_e64 s[72:73], v[1:2], v[2:3] ; encoding: [0x48,0x00,0x72,0xd0,0x01,0x05,0x02,0x00]
+0x48,0x00,0x72,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f64_e64 s[74:75], v[1:2], v[2:3] ; encoding: [0x4a,0x00,0x73,0xd0,0x01,0x05,0x02,0x00]
+0x4a,0x00,0x73,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f64_e64 s[76:77], v[1:2], v[2:3] ; encoding: [0x4c,0x00,0x74,0xd0,0x01,0x05,0x02,0x00]
+0x4c,0x00,0x74,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f64_e64 s[78:79], v[1:2], v[2:3] ; encoding: [0x4e,0x00,0x75,0xd0,0x01,0x05,0x02,0x00]
+0x4e,0x00,0x75,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f64_e64 s[80:81], v[1:2], v[2:3] ; encoding: [0x50,0x00,0x76,0xd0,0x01,0x05,0x02,0x00]
+0x50,0x00,0x76,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f64_e64 s[82:83], v[1:2], v[2:3] ; encoding: [0x52,0x00,0x77,0xd0,0x01,0x05,0x02,0x00]
+0x52,0x00,0x77,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f64_e64 s[84:85], v[1:2], v[2:3] ; encoding: [0x54,0x00,0x78,0xd0,0x01,0x05,0x02,0x00]
+0x54,0x00,0x78,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f64_e64 s[86:87], v[1:2], v[2:3] ; encoding: [0x56,0x00,0x79,0xd0,0x01,0x05,0x02,0x00]
+0x56,0x00,0x79,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f64_e64 s[88:89], v[1:2], v[2:3] ; encoding: [0x58,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00]
+0x58,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f64_e64 s[90:91], v[1:2], v[2:3] ; encoding: [0x5a,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00]
+0x5a,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f64_e64 s[92:93], v[1:2], v[2:3] ; encoding: [0x5c,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00]
+0x5c,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f64_e64 s[94:95], v[1:2], v[2:3] ; encoding: [0x5e,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00]
+0x5e,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f64_e64 s[96:97], v[1:2], v[2:3] ; encoding: [0x60,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00]
+0x60,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f64_e64 s[98:99], v[1:2], v[2:3] ; encoding: [0x62,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00]
+0x62,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i16_e64 s[100:101], v1, v2 ; encoding: [0x64,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00]
+0x64,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i16_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i16_e64 xnack_mask, v1, v2 ; encoding: [0x68,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00]
+0x68,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i16_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i16_e64 tba, v1, v2 ; encoding: [0x6c,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00]
+0x6c,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i16_e64 tma, v1, v2 ; encoding: [0x6e,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00]
+0x6e,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i16_e64 ttmp[0:1], v1, v2 ; encoding: [0x70,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00]
+0x70,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i16_e64 ttmp[2:3], v1, v2 ; encoding: [0x72,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00]
+0x72,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u16_e64 ttmp[4:5], v1, v2 ; encoding: [0x74,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00]
+0x74,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u16_e64 ttmp[6:7], v1, v2 ; encoding: [0x76,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00]
+0x76,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u16_e64 ttmp[8:9], v1, v2 ; encoding: [0x78,0x00,0xba,0xd0,0x01,0x05,0x02,0x00]
+0x78,0x00,0xba,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u16_e64 ttmp[10:11], v1, v2 ; encoding: [0x7a,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00]
+0x7a,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u16_e64 exec, v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00]
+0x7e,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt
new file mode 100644
index 0000000..0c4f107
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt
@@ -0,0 +1,402 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s 2>&1 | FileCheck -strict-whitespace %s
+
+# In GFX10+, v_cmpx_* use EXEC as the implicit dst. The disassembler issues a warning when the dst
+# is not 0x7e (EXEC). In GFX9 and earlier, these instructions have explicit dst. Therefore, such
+# warnings should not be issued.
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmp_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmp_class_f32_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0x10,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0x10,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmp_class_f32_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f64_e64 s[0:1], v[1:2], v2 ; encoding: [0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00]
+0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f16_e64 s[2:3], v1, v2 ; encoding: [0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00]
+0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f16_e64 s[4:5], v1, v2 ; encoding: [0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00]
+0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f16_e64 s[6:7], v1, v2 ; encoding: [0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00]
+0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f16_e64 s[8:9], v1, v2 ; encoding: [0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00]
+0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f16_e64 s[12:13], v1, v2 ; encoding: [0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00]
+0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f16_e64 s[14:15], v1, v2 ; encoding: [0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00]
+0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f16_e64 s[16:17], v1, v2 ; encoding: [0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00]
+0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f16_e64 s[18:19], v1, v2 ; encoding: [0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00]
+0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f16_e64 s[20:21], v1, v2 ; encoding: [0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00]
+0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f16_e64 s[22:23], v1, v2 ; encoding: [0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00]
+0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f16_e64 s[24:25], v1, v2 ; encoding: [0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00]
+0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f16_e64 s[26:27], v1, v2 ; encoding: [0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00]
+0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f16_e64 s[28:29], v1, v2 ; encoding: [0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00]
+0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f16_e64 s[30:31], v1, v2 ; encoding: [0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00]
+0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f16_e64 s[32:33], v1, v2 ; encoding: [0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00]
+0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f16_e64 s[34:35], v1, v2 ; encoding: [0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00]
+0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f32_e64 s[36:37], v1, v2 ; encoding: [0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00]
+0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f32_e64 s[38:39], v1, v2 ; encoding: [0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00]
+0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f32_e64 s[40:41], v1, v2 ; encoding: [0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00]
+0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f32_e64 s[42:43], v1, v2 ; encoding: [0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00]
+0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f32_e64 s[44:45], v1, v2 ; encoding: [0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00]
+0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f32_e64 s[46:47], v1, v2 ; encoding: [0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00]
+0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f32_e64 s[48:49], v1, v2 ; encoding: [0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00]
+0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f32_e64 s[50:51], v1, v2 ; encoding: [0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00]
+0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f32_e64 s[52:53], v1, v2 ; encoding: [0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00]
+0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f32_e64 s[54:55], v1, v2 ; encoding: [0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00]
+0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f32_e64 s[56:57], v1, v2 ; encoding: [0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00]
+0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f32_e64 s[58:59], v1, v2 ; encoding: [0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00]
+0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f32_e64 s[60:61], v1, v2 ; encoding: [0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00]
+0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f32_e64 s[62:63], v1, v2 ; encoding: [0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00]
+0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f32_e64 s[64:65], v1, v2 ; encoding: [0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00]
+0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f64_e64 s[66:67], v[1:2], v[2:3] ; encoding: [0x42,0x00,0x70,0xd0,0x01,0x05,0x02,0x00]
+0x42,0x00,0x70,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f64_e64 s[68:69], v[1:2], v[2:3] ; encoding: [0x44,0x00,0x72,0xd0,0x01,0x05,0x02,0x00]
+0x44,0x00,0x72,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f64_e64 s[70:71], v[1:2], v[2:3] ; encoding: [0x46,0x00,0x73,0xd0,0x01,0x05,0x02,0x00]
+0x46,0x00,0x73,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f64_e64 s[72:73], v[1:2], v[2:3] ; encoding: [0x48,0x00,0x74,0xd0,0x01,0x05,0x02,0x00]
+0x48,0x00,0x74,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f64_e64 s[74:75], v[1:2], v[2:3] ; encoding: [0x4a,0x00,0x75,0xd0,0x01,0x05,0x02,0x00]
+0x4a,0x00,0x75,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f64_e64 s[76:77], v[1:2], v[2:3] ; encoding: [0x4c,0x00,0x76,0xd0,0x01,0x05,0x02,0x00]
+0x4c,0x00,0x76,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f64_e64 s[78:79], v[1:2], v[2:3] ; encoding: [0x4e,0x00,0x77,0xd0,0x01,0x05,0x02,0x00]
+0x4e,0x00,0x77,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f64_e64 s[80:81], v[1:2], v[2:3] ; encoding: [0x50,0x00,0x78,0xd0,0x01,0x05,0x02,0x00]
+0x50,0x00,0x78,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f64_e64 s[82:83], v[1:2], v[2:3] ; encoding: [0x52,0x00,0x79,0xd0,0x01,0x05,0x02,0x00]
+0x52,0x00,0x79,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f64_e64 s[84:85], v[1:2], v[2:3] ; encoding: [0x54,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00]
+0x54,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f64_e64 s[86:87], v[1:2], v[2:3] ; encoding: [0x56,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00]
+0x56,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f64_e64 s[88:89], v[1:2], v[2:3] ; encoding: [0x58,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00]
+0x58,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f64_e64 s[90:91], v[1:2], v[2:3] ; encoding: [0x5a,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00]
+0x5a,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f64_e64 s[92:93], v[1:2], v[2:3] ; encoding: [0x5c,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00]
+0x5c,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f64_e64 s[94:95], v[1:2], v[2:3] ; encoding: [0x5e,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00]
+0x5e,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i16_e64 s[96:97], v1, v2 ; encoding: [0x60,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00]
+0x60,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i16_e64 s[98:99], v1, v2 ; encoding: [0x62,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00]
+0x62,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i16_e64 s[100:101], v1, v2 ; encoding: [0x64,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00]
+0x64,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i16_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i16_e64 xnack_mask, v1, v2 ; encoding: [0x68,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00]
+0x68,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i16_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i16_e64 ttmp[0:1], v1, v2 ; encoding: [0x6c,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00]
+0x6c,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i16_e64 ttmp[2:3], v1, v2 ; encoding: [0x6e,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00]
+0x6e,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u16_e64 ttmp[4:5], v1, v2 ; encoding: [0x70,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00]
+0x70,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u16_e64 ttmp[6:7], v1, v2 ; encoding: [0x72,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00]
+0x72,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u16_e64 ttmp[8:9], v1, v2 ; encoding: [0x74,0x00,0xba,0xd0,0x01,0x05,0x02,0x00]
+0x74,0x00,0xba,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u16_e64 ttmp[10:11], v1, v2 ; encoding: [0x76,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00]
+0x76,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u16_e64 ttmp[12:13], v1, v2 ; encoding: [0x78,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00]
+0x78,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u16_e64 ttmp[14:15], v1, v2 ; encoding: [0x7a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00]
+0x7a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00
+
diff --git a/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt b/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
index 4ec534f..fa40fe6 100644
--- a/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
@@ -17,6 +17,10 @@
# INTEL: pushp r16
0xd5,0x18,0x50
+# ATT: pushq %r16
+# INTEL: push r16
+0xd5,0x10,0x50
+
# ATT: popp %rax
# INTEL: popp rax
0xd5,0x08,0x58
@@ -32,3 +36,7 @@
# ATT: popp %r16
# INTEL: popp r16
0xd5,0x18,0x58
+
+# ATT: popq %r16
+# INTEL: pop r16
+0xd5,0x10,0x58
diff --git a/llvm/test/MC/X86/apx/pushp-popp-att.s b/llvm/test/MC/X86/apx/pushp-popp-att.s
index a810744..d638034 100644
--- a/llvm/test/MC/X86/apx/pushp-popp-att.s
+++ b/llvm/test/MC/X86/apx/pushp-popp-att.s
@@ -1,7 +1,7 @@
# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
-# ERROR-COUNT-8: error:
+# ERROR-COUNT-10: error:
# ERROR-NOT: error:
# CHECK: pushp %rax
@@ -16,6 +16,9 @@
# CHECK: pushp %r16
# CHECK: encoding: [0xd5,0x18,0x50]
pushp %r16
+# CHECK: pushq %r16
+# CHECK: encoding: [0xd5,0x10,0x50]
+ pushq %r16
# CHECK: popp %rax
# CHECK: encoding: [0xd5,0x08,0x58]
@@ -29,3 +32,6 @@
# CHECK: popp %r16
# CHECK: encoding: [0xd5,0x18,0x58]
popp %r16
+# CHECK: popq %r16
+# CHECK: encoding: [0xd5,0x10,0x58]
+ popq %r16
diff --git a/llvm/test/Other/loop-pm-invalidation.ll b/llvm/test/Other/loop-pm-invalidation.ll
index 4bead0b..25552f7 100644
--- a/llvm/test/Other/loop-pm-invalidation.ll
+++ b/llvm/test/Other/loop-pm-invalidation.ll
@@ -16,11 +16,6 @@
; RUN: opt -disable-output -disable-verify -verify-analysis-invalidation=0 -debug-pass-manager %s -aa-pipeline= 2>&1 \
; RUN: -passes='loop(no-op-loop,loop-deletion),invalidate<scalar-evolution>,loop(no-op-loop)' \
; RUN: | FileCheck %s --check-prefix=CHECK-SCEV-INV-AFTER-DELETE
-;
-; Test that BFI is invalidated after the loop adapter if any of the loop passes
-; invalidated it.
-; RUN: opt -disable-output -disable-verify -verify-analysis-invalidation=0 -debug-pass-manager %s -aa-pipeline= 2>&1 \
-; RUN: -O1 | FileCheck %s --check-prefix=CHECK-BFI-INV
define void @no_loops() {
; CHECK-LOOP-INV: Running pass: LoopSimplifyPass
@@ -247,28 +242,3 @@ l0.header:
exit:
ret void
}
-
-; CHECK-BFI-INV-LABEL: Running analysis: OuterAnalysisManagerProxy<{{.*}}> on loop %l0.header in function simplifiable_loop
-; CHECK-BFI-INV-NEXT: Running pass: LoopInstSimplifyPass on loop %l0.header in function simplifiable_loop
-; CHECK-BFI-INV-NEXT: Running pass: LoopSimplifyCFGPass on loop %l0.header in function simplifiable_loop
-; CHECK-BFI-INV-NEXT: Running pass: LICMPass on loop %l0.header in function simplifiable_loop
-; CHECK-BFI-INV-NEXT: Running pass: LoopRotatePass on loop %l0.header in function simplifiable_loop
-; CHECK-BFI-INV-NEXT: Running pass: LICMPass on loop %l0.header in function simplifiable_loop
-; CHECK-BFI-INV-NEXT: Running pass: SimpleLoopUnswitchPass on loop %l0.header in function simplifiable_loop
-; CHECK-BFI-INV-NEXT: Invalidating analysis: PostDominatorTreeAnalysis on simplifiable_loop
-; CHECK-BFI-INV-NEXT: Invalidating analysis: BranchProbabilityAnalysis on simplifiable_loop
-; CHECK-BFI-INV-NEXT: Invalidating analysis: BlockFrequencyAnalysis on simplifiable_loop
-; CHECK-BFI-INV-NEXT: Running pass: SimplifyCFGPass on simplifiable_loop (5 instructions)
-
-define void @simplifiable_loop(i1 %c) !prof !0 {
-entry:
- br label %l0.header
-
-l0.header:
- br label %l0.latch
-
-l0.latch:
- br i1 %c, label %l0.header, label %l0.latch
-}
-
-!0 = !{!"function_entry_count", i64 1}
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 94e860b..65b96c8 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -186,7 +186,6 @@
; CHECK-O-NEXT: Running pass: LoopRotatePass
; CHECK-O-NEXT: Running pass: LICM
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index a08a140..3a0fffe 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -114,7 +114,6 @@
; CHECK-O-NEXT: Running pass: LoopRotatePass
; CHECK-O-NEXT: Running pass: LICM
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index d9e2dd3..4623edc 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -100,7 +100,6 @@
; CHECK-O-NEXT: Running pass: LoopRotatePass
; CHECK-O-NEXT: Running pass: LICM
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 2f6fa4b..590afd9 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -109,7 +109,6 @@
; CHECK-O-NEXT: Running pass: LoopRotatePass
; CHECK-O-NEXT: Running pass: LICM
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 5aacd26..dd6acd2 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -146,7 +146,6 @@
; CHECK-O-NEXT: Running pass: LoopRotatePass
; CHECK-O-NEXT: Running pass: LICM
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index f6a9406..ee05452 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -149,7 +149,6 @@
; CHECK-O-NEXT: Running pass: LoopRotatePass
; CHECK-O-NEXT: Running pass: LICM
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 48a9433..fd95e94 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -114,7 +114,6 @@
; CHECK-O-NEXT: Running pass: LoopRotatePass
; CHECK-O-NEXT: Running pass: LICM
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll b/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
index 7a4d860..fe2a002 100644
--- a/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
+++ b/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
@@ -17,11 +17,12 @@
; RUN: -r=%t/foo.o,foo,plx \
; RUN: -r=%t/foo.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -thinlto-threads=1 \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR
+
; DUMP: Callsite Context Graph:
-; RUN: llvm-dis %t.out.1.3.import.bc -o - | FileCheck %s --check-prefix=IR
; IR: @main()
; IR: !memprof {{.*}} !callsite
; IR: @_Znam(i64 0) #[[ATTR:[0-9]+]]
@@ -41,13 +42,12 @@
; RUN: -r=%t/foo.o,foo,plx \
; RUN: -r=%t/foo.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -thinlto-threads=1 \
; RUN: -o %t.out 2>&1 | FileCheck %s --allow-empty \
-; RUN: --implicit-check-not "Callsite Context Graph:"
-
-; RUN: llvm-dis %t.out.1.3.import.bc -o - | FileCheck %s \
-; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
-; RUN: --implicit-check-not "memprof"="cold"
+; RUN: --implicit-check-not "Callsite Context Graph:" \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
;--- main.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll
index 9371fe2..dbd572d 100644
--- a/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -p indvars -S %s | FileCheck %s
+; RUN: opt -p indvars -data-layout='n32:64' -S %s | FileCheck --check-prefix=N32 %s
declare i1 @cond()
@@ -28,6 +29,32 @@ define i64 @test_ptr_compare_guard(ptr %start, ptr %end) {
; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RES_PH]], %[[EXIT_LOOPEXIT]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
+; N32-LABEL: define i64 @test_ptr_compare_guard(
+; N32-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) {
+; N32-NEXT: [[ENTRY:.*]]:
+; N32-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; N32-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; N32-NEXT: [[C_0:%.*]] = icmp eq ptr [[START]], [[END]]
+; N32-NEXT: br i1 [[C_0]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]]
+; N32: [[LOOP_HEADER_PREHEADER]]:
+; N32-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -1
+; N32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; N32-NEXT: br label %[[LOOP_HEADER:.*]]
+; N32: [[LOOP_HEADER]]:
+; N32-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[LOOP_HEADER_PREHEADER]] ]
+; N32-NEXT: [[C_1:%.*]] = call i1 @cond()
+; N32-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[EXIT_LOOPEXIT:.*]]
+; N32: [[LOOP_LATCH]]:
+; N32-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
+; N32-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; N32-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]]
+; N32: [[EXIT_LOOPEXIT]]:
+; N32-NEXT: [[RES_PH:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[TMP1]], %[[LOOP_LATCH]] ]
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT]]:
+; N32-NEXT: [[RES:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RES_PH]], %[[EXIT_LOOPEXIT]] ]
+; N32-NEXT: ret i64 [[RES]]
+;
entry:
%c.0 = icmp eq ptr %start, %end
br i1 %c.0, label %exit, label %loop.header
@@ -48,3 +75,149 @@ exit:
%res = phi i64 [ 0, %entry ], [ %i64.iv, %loop.latch ], [ 0, %loop.header ]
ret i64 %res
}
+
+define void @test_sub_cmp(ptr align 8 %start, ptr %end) {
+; CHECK-LABEL: define void @test_sub_cmp(
+; CHECK-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; CHECK-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]]
+; CHECK-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]]
+; CHECK: [[LOOP_HEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
+; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT: [[CMP_LATCH:%.*]] = icmp ult i64 [[IV_NEXT]], [[PTR_DIFF]]
+; CHECK-NEXT: br i1 [[CMP_LATCH]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]]
+; CHECK: [[EXIT_EARLY]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+; N32-LABEL: define void @test_sub_cmp(
+; N32-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) {
+; N32-NEXT: [[ENTRY:.*:]]
+; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; N32-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]]
+; N32-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]]
+; N32: [[LOOP_HEADER_PREHEADER]]:
+; N32-NEXT: br label %[[LOOP_HEADER:.*]]
+; N32: [[LOOP_HEADER]]:
+; N32-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
+; N32-NEXT: [[C_1:%.*]] = call i1 @cond()
+; N32-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]]
+; N32: [[LOOP_LATCH]]:
+; N32-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; N32-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[PTR_DIFF]]
+; N32-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]]
+; N32: [[EXIT_EARLY]]:
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT_LOOPEXIT]]:
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT]]:
+; N32-NEXT: ret void
+;
+entry:
+ %start.int = ptrtoint ptr %start to i64
+ %end.int = ptrtoint ptr %end to i64
+ %ptr.diff = sub i64 %start.int, %end.int
+ %cmp.entry = icmp eq ptr %start, %end
+ br i1 %cmp.entry, label %exit, label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %c.1 = call i1 @cond()
+ br i1 %c.1, label %exit.early, label %loop.latch
+
+loop.latch:
+ %iv.next = add i64 %iv, 1
+ %cmp.latch = icmp ult i64 %iv.next, %ptr.diff
+ br i1 %cmp.latch, label %loop.header, label %exit
+
+exit.early:
+ br label %exit
+
+exit:
+ ret void
+}
+
+
+define void @test_ptr_diff_with_assume(ptr align 8 %start, ptr align 8 %end, ptr %P) {
+; CHECK-LABEL: define void @test_ptr_diff_with_assume(
+; CHECK-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; CHECK-NEXT: [[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2
+; CHECK-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]])
+; CHECK-NEXT: [[COMPUTED_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[PTR_DIFF]]
+; CHECK-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]]
+; CHECK-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]]
+; CHECK: [[LOOP_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[LOOP_BODY:.*]]
+; CHECK: [[LOOP_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_BODY]] ], [ [[START]], %[[LOOP_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i1 @cond()
+; CHECK-NEXT: [[IV_NEXT]] = getelementptr i8, ptr [[IV]], i64 1
+; CHECK-NEXT: [[LOOP_CMP:%.*]] = icmp eq ptr [[IV_NEXT]], [[COMPUTED_END]]
+; CHECK-NEXT: br i1 [[LOOP_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+; N32-LABEL: define void @test_ptr_diff_with_assume(
+; N32-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) {
+; N32-NEXT: [[ENTRY:.*:]]
+; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; N32-NEXT: [[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2
+; N32-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]])
+; N32-NEXT: [[COMPUTED_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[PTR_DIFF]]
+; N32-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]]
+; N32-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]]
+; N32: [[LOOP_BODY_PREHEADER]]:
+; N32-NEXT: br label %[[LOOP_BODY:.*]]
+; N32: [[LOOP_BODY]]:
+; N32-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_BODY]] ], [ [[START]], %[[LOOP_BODY_PREHEADER]] ]
+; N32-NEXT: [[TMP0:%.*]] = call i1 @cond()
+; N32-NEXT: [[IV_NEXT]] = getelementptr i8, ptr [[IV]], i64 1
+; N32-NEXT: [[LOOP_CMP:%.*]] = icmp eq ptr [[IV_NEXT]], [[COMPUTED_END]]
+; N32-NEXT: br i1 [[LOOP_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]]
+; N32: [[EXIT_LOOPEXIT]]:
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT]]:
+; N32-NEXT: ret void
+;
+entry:
+ %start.int = ptrtoint ptr %start to i64
+ %end.int = ptrtoint ptr %end to i64
+ %ptr.diff = sub i64 %start.int, %end.int
+ %diff.cmp = icmp ult i64 %ptr.diff, 2
+ call void @llvm.assume(i1 %diff.cmp)
+ %computed.end = getelementptr i8, ptr %start, i64 %ptr.diff
+ %entry.cmp = icmp eq ptr %start, %end
+ br i1 %entry.cmp, label %exit, label %loop.body
+
+loop.body:
+ %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop.body ]
+ call i1 @cond()
+ %iv.next = getelementptr i8, ptr %iv, i64 1
+ %loop.cmp = icmp eq ptr %iv.next, %computed.end
+ br i1 %loop.cmp, label %exit, label %loop.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-trunc.ll b/llvm/test/Transforms/InstCombine/icmp-trunc.ll
index b85deab..ad76ef7 100644
--- a/llvm/test/Transforms/InstCombine/icmp-trunc.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-trunc.ll
@@ -3,6 +3,7 @@
; RUN: opt < %s -passes=instcombine -S -data-layout="n8" | FileCheck %s --check-prefixes=CHECK,DL8
declare void @use(i8)
+declare void @use2(i4)
define i1 @ult_2(i32 %x) {
; CHECK-LABEL: @ult_2(
@@ -785,3 +786,32 @@ define <2 x i1> @uge_nsw_non_splat(<2 x i32> %x) {
ret <2 x i1> %r
}
+define i1 @trunc_icmp(i8 %a0) {
+; CHECK-LABEL: @trunc_icmp(
+; CHECK-NEXT: [[TZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0:%.*]], i1 false)
+; CHECK-NEXT: [[TR:%.*]] = trunc nuw i8 [[TZ]] to i4
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A0]], 0
+; CHECK-NEXT: call void @use2(i4 [[TR]])
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %tz = tail call range(i8 0, 9) i8 @llvm.cttz.i8(i8 %a0, i1 false)
+ %tr = trunc i8 %tz to i4
+ %c = icmp eq i4 %tr, 8
+ call void @use2(i4 %tr)
+ ret i1 %c
+}
+
+define i1 @do_not_mask_trunc_eq_i32_i8(i32 %x) {
+; DL64-LABEL: @do_not_mask_trunc_eq_i32_i8(
+; DL64-NEXT: [[R:%.*]] = icmp eq i32 [[X:%.*]], 42
+; DL64-NEXT: ret i1 [[R]]
+;
+; DL8-LABEL: @do_not_mask_trunc_eq_i32_i8(
+; DL8-NEXT: [[T:%.*]] = trunc nuw i32 [[X:%.*]] to i8
+; DL8-NEXT: [[R:%.*]] = icmp eq i8 [[T]], 42
+; DL8-NEXT: ret i1 [[R]]
+;
+ %t = trunc nuw i32 %x to i8
+ %r = icmp eq i8 %t, 42
+ ret i1 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll
index 2bf22ae..c0be5b9 100644
--- a/llvm/test/Transforms/InstCombine/scmp.ll
+++ b/llvm/test/Transforms/InstCombine/scmp.ll
@@ -423,6 +423,86 @@ define i8 @scmp_from_select_eq_and_gt_commuted3(i32 %x, i32 %y) {
ret i8 %r
}
+; Commutative tests for (x != y) ? (x > y ? 1 : -1) : 0
+define i8 @scmp_from_select_ne_and_gt_commuted1(i32 %x, i32 %y) {
+; CHECK-LABEL: define i8 @scmp_from_select_ne_and_gt_commuted1(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[Y]], i32 [[X]])
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %ne = icmp ne i32 %x, %y
+ %gt = icmp slt i32 %x, %y
+ %sel1 = select i1 %gt, i8 1, i8 -1
+ %r = select i1 %ne, i8 %sel1, i8 0
+ ret i8 %r
+}
+
+define i8 @scmp_from_select_ne_and_gt_commuted2(i32 %x, i32 %y) {
+; CHECK-LABEL: define i8 @scmp_from_select_ne_and_gt_commuted2(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[Y]], i32 [[X]])
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %ne = icmp ne i32 %x, %y
+ %gt = icmp sgt i32 %x, %y
+ %sel1 = select i1 %gt, i8 -1, i8 1
+ %r = select i1 %ne, i8 %sel1, i8 0
+ ret i8 %r
+}
+
+define i8 @scmp_from_select_ne_and_gt_commuted3(i32 %x, i32 %y) {
+; CHECK-LABEL: define i8 @scmp_from_select_ne_and_gt_commuted3(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %ne = icmp ne i32 %x, %y
+ %gt = icmp sgt i32 %x, %y
+ %sel1 = select i1 %gt, i8 1, i8 -1
+ %r = select i1 %ne, i8 %sel1, i8 0
+ ret i8 %r
+}
+
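+; Commutative tests for x != C ? (x > C - 1 ? 1 : -1) : 0. NOTE(review): the three functions below currently have identical bodies, so no commuted form is exercised yet.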
+define i8 @scmp_from_select_ne_const_and_gt_commuted1(i32 %x) {
+; CHECK-LABEL: define i8 @scmp_from_select_ne_const_and_gt_commuted1(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 5)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %ne = icmp ne i32 %x, 5
+ %gt = icmp sgt i32 %x, 4
+ %sel1 = select i1 %gt, i8 1, i8 -1
+ %r = select i1 %ne, i8 %sel1, i8 0
+ ret i8 %r
+}
+
+define i8 @scmp_from_select_ne_const_and_gt_commuted2(i32 %x) {
+; CHECK-LABEL: define i8 @scmp_from_select_ne_const_and_gt_commuted2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 5)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %ne = icmp ne i32 %x, 5
+ %gt = icmp sgt i32 %x, 4
+ %sel1 = select i1 %gt, i8 1, i8 -1
+ %r = select i1 %ne, i8 %sel1, i8 0
+ ret i8 %r
+}
+
+define i8 @scmp_from_select_ne_const_and_gt_commuted3(i32 %x) {
+; CHECK-LABEL: define i8 @scmp_from_select_ne_const_and_gt_commuted3(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 5)
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %ne = icmp ne i32 %x, 5
+ %gt = icmp sgt i32 %x, 4
+ %sel1 = select i1 %gt, i8 1, i8 -1
+ %r = select i1 %ne, i8 %sel1, i8 0
+ ret i8 %r
+}
+
define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
; CHECK-LABEL: define <3 x i2> @scmp_unary_shuffle_ops(
; CHECK-SAME: <3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) {
@@ -436,6 +516,187 @@ define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
ret <3 x i2> %r
}
+define i32 @scmp_sgt_slt(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_sgt_slt(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i32 [[A]], 31
+; CHECK-NEXT: [[CMP_INV:%.*]] = icmp slt i32 [[A]], 1
+; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[CMP_INV]], i32 [[A_LOBIT]], i32 1
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp sgt i32 %a, 0
+ %cmp1 = icmp slt i32 %a, 0
+ %. = select i1 %cmp1, i32 -1, i32 0
+ %retval.0 = select i1 %cmp, i32 1, i32 %.
+ ret i32 %retval.0
+}
+
+define i32 @scmp_zero_slt(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_zero_slt(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1.inv = icmp slt i32 %a, 1
+ %. = select i1 %cmp1.inv, i32 -1, i32 1
+ %retval.0 = select i1 %cmp, i32 0, i32 %.
+ ret i32 %retval.0
+}
+
+define i32 @scmp_zero_sgt(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_zero_sgt(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1.inv = icmp sgt i32 %a, -1
+ %. = select i1 %cmp1.inv, i32 1, i32 -1
+ %retval.0 = select i1 %cmp, i32 0, i32 %.
+ ret i32 %retval.0
+}
+
+
+define i32 @scmp_zero_sgt_1(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_zero_sgt_1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[COND2:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[COND2]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp sgt i32 %a, -1
+ %cond = select i1 %cmp1, i32 1, i32 -1
+ %cond2 = select i1 %cmp, i32 0, i32 %cond
+ ret i32 %cond2
+}
+
+define i32 @scmp_zero_slt_1(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_zero_slt_1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[COND2:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 0)
+; CHECK-NEXT: ret i32 [[COND2]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp slt i32 %a, 1
+ %cond = select i1 %cmp1, i32 -1, i32 1
+ %cond2 = select i1 %cmp, i32 0, i32 %cond
+ ret i32 %cond2
+}
+
+define i32 @scmp_zero_slt_neg(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_zero_slt_neg(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], -1
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], i32 -1, i32 1
+; CHECK-NEXT: [[COND2:%.*]] = select i1 [[CMP]], i32 0, i32 [[COND]]
+; CHECK-NEXT: ret i32 [[COND2]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp slt i32 %a, -1
+ %cond = select i1 %cmp1, i32 -1, i32 1
+ %cond2 = select i1 %cmp, i32 0, i32 %cond
+ ret i32 %cond2
+}
+
+define i32 @scmp_zero_sgt_neg(i32 %a) {
+; CHECK-LABEL: define i32 @scmp_zero_sgt_neg(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[A]], 1
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], i32 1, i32 -1
+; CHECK-NEXT: [[COND2:%.*]] = select i1 [[CMP]], i32 0, i32 [[COND]]
+; CHECK-NEXT: ret i32 [[COND2]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp sgt i32 %a, 1
+ %cond = select i1 %cmp1, i32 1, i32 -1
+ %cond2 = select i1 %cmp, i32 0, i32 %cond
+ ret i32 %cond2
+}
+
+define i32 @ucmp_ugt_ult_neg(i32 %a) {
+; CHECK-LABEL: define i32 @ucmp_ugt_ult_neg(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT: [[RETVAL_0:%.*]] = zext i1 [[CMP_NOT]] to i32
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp ugt i32 %a, 0
+ %cmp1 = icmp ult i32 %a, 0
+ %. = select i1 %cmp1, i32 -1, i32 0
+ %retval.0 = select i1 %cmp, i32 1, i32 %.
+ ret i32 %retval.0
+}
+
+define i32 @ucmp_zero_ult_neg(i32 %a) {
+; CHECK-LABEL: define i32 @ucmp_zero_ult_neg(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT: [[RETVAL_0:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1.inv = icmp ult i32 %a, 1
+ %. = select i1 %cmp1.inv, i32 -1, i32 1
+ %retval.0 = select i1 %cmp, i32 0, i32 %.
+ ret i32 %retval.0
+}
+
+define i32 @ucmp_zero_ugt_neg(i32 %a) {
+; CHECK-LABEL: define i32 @ucmp_zero_ugt_neg(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT: [[RETVAL_0:%.*]] = sext i1 [[CMP]] to i32
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp eq i32 %a, 0
+ %cmp1.inv = icmp ugt i32 %a, -1
+ %. = select i1 %cmp1.inv, i32 1, i32 -1
+ %retval.0 = select i1 %cmp, i32 0, i32 %.
+ ret i32 %retval.0
+}
+
+define i32 @scmp_sgt_slt_ab(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @scmp_sgt_slt_ab(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp sgt i32 %a, %b
+ %cmp1 = icmp slt i32 %a, %b
+ %. = select i1 %cmp1, i32 -1, i32 0
+ %retval.0 = select i1 %cmp, i32 1, i32 %.
+ ret i32 %retval.0
+}
+
+define i32 @scmp_zero_slt_ab(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @scmp_zero_slt_ab(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp eq i32 %a, %b
+ %cmp1.inv = icmp slt i32 %a, %b
+ %. = select i1 %cmp1.inv, i32 -1, i32 1
+ %retval.0 = select i1 %cmp, i32 0, i32 %.
+ ret i32 %retval.0
+}
+
+define i32 @scmp_zero_sgt_ab(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @scmp_zero_sgt_ab(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[RETVAL_0:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+ %cmp = icmp eq i32 %a, %b
+ %cmp1.inv = icmp sgt i32 %a, %b
+ %. = select i1 %cmp1.inv, i32 1, i32 -1
+ %retval.0 = select i1 %cmp, i32 0, i32 %.
+ ret i32 %retval.0
+}
+
; Negative test: true value of outer select is not zero
define i8 @scmp_from_select_eq_and_gt_neg1(i32 %x, i32 %y) {
; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg1(
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll b/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll
index 7b30edb..71dad41 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=instsimplify -S < %s | FileCheck %s
+; RUN: opt -passes=instsimplify -use-constant-int-for-fixed-length-splat -S < %s | FileCheck %s
; Test that intrinsics wasm call are constant folded
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll
index 68b45a94..f68b85e 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
declare i31 @llvm.ctpop.i31(i31 %val)
declare i32 @llvm.cttz.i32(i32 %val, i1)
@@ -120,6 +121,22 @@ define <2 x i31> @ctpop_vector() {
ret <2 x i31> %x
}
+define <2 x i31> @ctpop_vector_splat_v2i31() {
+; CHECK-LABEL: @ctpop_vector_splat_v2i31(
+; CHECK-NEXT: ret <2 x i31> splat (i31 1)
+;
+ %x = call <2 x i31> @llvm.ctpop.v2i31(<2 x i31> splat(i31 16))
+ ret <2 x i31> %x
+}
+
+define <vscale x 2 x i31> @ctpop_vector_splat_nxv2i31() {
+; CHECK-LABEL: @ctpop_vector_splat_nxv2i31(
+; CHECK-NEXT: ret <vscale x 2 x i31> splat (i31 1)
+;
+ %x = call <vscale x 2 x i31> @llvm.ctpop.nxv2i31(<vscale x 2 x i31> splat(i31 16))
+ ret <vscale x 2 x i31> %x
+}
+
define <2 x i31> @ctpop_vector_undef() {
; CHECK-LABEL: @ctpop_vector_undef(
; CHECK-NEXT: ret <2 x i31> zeroinitializer
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll
new file mode 100644
index 0000000..409141a
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
+
+define i16 @W() {
+; CHECK-LABEL: define i16 @W() {
+; CHECK-NEXT: ret i16 -32768
+;
+ %Z = call i16 @llvm.bitreverse.i16(i16 1)
+ ret i16 %Z
+}
+
+define i32 @X() {
+; CHECK-LABEL: define i32 @X() {
+; CHECK-NEXT: ret i32 -2147483648
+;
+ %Z = call i32 @llvm.bitreverse.i32(i32 1)
+ ret i32 %Z
+}
+
+define i64 @Y() {
+; CHECK-LABEL: define i64 @Y() {
+; CHECK-NEXT: ret i64 -9223372036854775808
+;
+ %Z = call i64 @llvm.bitreverse.i64(i64 1)
+ ret i64 %Z
+}
+
+define i80 @Z() {
+; CHECK-LABEL: define i80 @Z() {
+; CHECK-NEXT: ret i80 23777929115895377691656
+;
+ %Z = call i80 @llvm.bitreverse.i80(i80 76151636403560493650080)
+ ret i80 %Z
+}
+
+define <4 x i32> @bitreverse_splat_v4i32() {
+; CHECK-LABEL: define <4 x i32> @bitreverse_splat_v4i32() {
+; CHECK-NEXT: ret <4 x i32> splat (i32 -2147483648)
+;
+ %Z = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> splat(i32 1))
+ ret <4 x i32> %Z
+}
+
+define <vscale x 4 x i32> @bitreverse_splat_nxv4i32() {
+; CHECK-LABEL: define <vscale x 4 x i32> @bitreverse_splat_nxv4i32() {
+; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 -2147483648)
+;
+ %Z = call <vscale x 4 x i32> @llvm.bitreverse.nxv4i32(<vscale x 4 x i32> splat(i32 1))
+ ret <vscale x 4 x i32> %Z
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll
index 42bb733..4db8ced 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll
@@ -2,6 +2,7 @@
; bswap should be constant folded when it is passed a constant argument
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
declare i16 @llvm.bswap.i16(i16)
@@ -42,3 +43,19 @@ define i80 @Z() {
%Z = call i80 @llvm.bswap.i80( i80 76151636403560493650080 )
ret i80 %Z
}
+
+define <4 x i32> @bswap_splat_v4i32() {
+; CHECK-LABEL: define <4 x i32> @bswap_splat_v4i32() {
+; CHECK-NEXT: ret <4 x i32> splat (i32 16777216)
+;
+ %Z = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> splat(i32 1))
+ ret <4 x i32> %Z
+}
+
+define <vscale x 4 x i32> @bswap_splat_nxv4i32() {
+; CHECK-LABEL: define <vscale x 4 x i32> @bswap_splat_nxv4i32() {
+; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 16777216)
+;
+ %Z = call <vscale x 4 x i32> @llvm.bswap.nxv4i32(<vscale x 4 x i32> splat(i32 1))
+ ret <vscale x 4 x i32> %Z
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
index e994921..9f9e3f9 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s
declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
diff --git a/llvm/test/Transforms/LoopPredication/preserve-bpi.ll b/llvm/test/Transforms/LoopPredication/preserve-bpi.ll
deleted file mode 100644
index 7fbb197..0000000
--- a/llvm/test/Transforms/LoopPredication/preserve-bpi.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt -mtriple=x86_64 -passes='loop-mssa(loop-predication,licm,simple-loop-unswitch<nontrivial>,loop-simplifycfg)' -debug-pass-manager -debug-only=branch-prob -S < %s 2>&1 | FileCheck %s
-
-; REQUIRES: asserts
-
-; This test is to solely check that we do not run BPI every single time loop
-; predication is invoked (since BPI is preserved as part of
-; LoopStandardAnalysisResults).
-declare void @llvm.experimental.guard(i1, ...)
-
-; CHECK: Running pass: LoopPredicationPass on loop
-; CHECK-NEXT: Running pass: LICMPass on loop
-; CHECK-NEXT: Running pass: SimpleLoopUnswitchPass on loop
-; CHECK-NEXT: Running analysis: OuterAnalysisManagerProxy
-; CHECK-NEXT: Running pass: LoopPredicationPass on loop
-; CHECK-NEXT: Running pass: LICMPass on loop
-; CHECK-NEXT: Running pass: SimpleLoopUnswitchPass on loop
-; CHECK-NEXT: Running pass: LoopSimplifyCFGPass on loop
-
-define i32 @unsigned_loop_0_to_n_ult_check(ptr %array, i32 %length, i32 %n) {
-entry:
- %tmp5 = icmp eq i32 %n, 0
- br i1 %tmp5, label %exit, label %loop.preheader
-
-loop.preheader: ; preds = %entry
- br label %loop
-
-loop: ; preds = %guarded, %loop.preheader
- %loop.acc = phi i32 [ %loop.acc.next, %guarded ], [ 0, %loop.preheader ]
- %i = phi i32 [ %i.next, %guarded ], [ 0, %loop.preheader ]
- %within.bounds = icmp ult i32 %i, %length
- %widenable_cond = call i1 @llvm.experimental.widenable.condition()
- %exiplicit_guard_cond = and i1 %within.bounds, %widenable_cond
- br i1 %exiplicit_guard_cond, label %guarded, label %deopt, !prof !0
-
-deopt: ; preds = %loop
- %deoptcall = call i32 (...) @llvm.experimental.deoptimize.i32(i32 9) [ "deopt"() ]
- ret i32 %deoptcall
-
-guarded: ; preds = %loop
- %i.i64 = zext i32 %i to i64
- %array.i.ptr = getelementptr inbounds i32, ptr %array, i64 %i.i64
- %array.i = load i32, ptr %array.i.ptr, align 4
- %loop.acc.next = add i32 %loop.acc, %array.i
- %i.next = add nuw i32 %i, 1
- %continue = icmp ult i32 %i.next, %n
- br i1 %continue, label %loop, label %exit, !prof !2
-
-exit: ; preds = %guarded, %entry
- %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %guarded ]
- ret i32 %result
-}
-
-declare i32 @llvm.experimental.deoptimize.i32(...)
-declare i1 @llvm.experimental.widenable.condition() #0
-
-attributes #0 = { inaccessiblememonly nounwind }
-
-!0 = !{!"branch_weights", i32 1048576, i32 1}
-!1 = !{i32 1, i32 -2147483648}
-!2 = !{!"branch_weights", i32 1024, i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
index b106f99..1153d18 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -6,7 +6,7 @@
; Check that the addresses for a scalarized memory access is not extracted
; from a vector register.
-define i32 @foo(ptr nocapture %A) {
+define void @foo(ptr nocapture %A) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -27,7 +27,7 @@ define i32 @foo(ptr nocapture %A) {
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 poison
+; CHECK-NEXT: ret void
;
entry:
@@ -44,12 +44,12 @@ for.body:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 poison
+ ret void
}
; Check that a load of address is scalarized.
-define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
+define void @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
; CHECK-LABEL: @foo1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -74,7 +74,7 @@ define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 poison
+; CHECK-NEXT: ret void
;
entry:
@@ -93,5 +93,5 @@ for.body:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 poison
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/pr48832.ll b/llvm/test/Transforms/LoopVectorize/pr48832.ll
index b89be88..c6ebe85 100644
--- a/llvm/test/Transforms/LoopVectorize/pr48832.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr48832.ll
@@ -23,7 +23,7 @@ for.body: ; preds = %for.cond
br i1 true, label %cond.false, label %land.rhs
land.rhs: ; preds = %for.body
- br i1 poison, label %cond.end, label %cond.false
+ br i1 false, label %cond.end, label %cond.false
cond.false: ; preds = %for.body, %land.rhs
br label %cond.end
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
index d281905..abd1d96 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128
; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
@@ -10,179 +9,6 @@
target triple = "aarch64-unknown-unknown"
define void @multiply(ptr %A, ptr %B, ptr %C) {
-; PTR128-LABEL: @multiply(
-; PTR128-NEXT: entry:
-; PTR128-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128
-; PTR128-NEXT: [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128
-; PTR128-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128
-; PTR128-NEXT: [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]]
-; PTR128-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
-; PTR128: alias_cont:
-; PTR128-NEXT: [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128
-; PTR128-NEXT: [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]]
-; PTR128-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
-; PTR128: copy:
-; PTR128-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
-; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
-; PTR128-NEXT: br label [[NO_ALIAS]]
-; PTR128: no_alias:
-; PTR128-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
-; PTR128-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128
-; PTR128-NEXT: [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128
-; PTR128-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128
-; PTR128-NEXT: [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]]
-; PTR128-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
-; PTR128: alias_cont1:
-; PTR128-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128
-; PTR128-NEXT: [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]]
-; PTR128-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
-; PTR128: copy2:
-; PTR128-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
-; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
-; PTR128-NEXT: br label [[NO_ALIAS3]]
-; PTR128: no_alias3:
-; PTR128-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
-; PTR128-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
-; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
-; PTR128-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
-; PTR128-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
-; PTR128-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
-; PTR128-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
-; PTR128-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
-; PTR128-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
-; PTR128-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
-; PTR128-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
-; PTR128-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
-; PTR128-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
-; PTR128-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
-; PTR128-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
-; PTR128-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
-; PTR128-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
-; PTR128-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
-; PTR128-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
-; PTR128-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
-; PTR128-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
-; PTR128-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32
-; PTR128-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
-; PTR128-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
-; PTR128-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
-; PTR128-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
-; PTR128-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
-; PTR128-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
-; PTR128-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
-; PTR128-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
-; PTR128-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
-; PTR128-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
-; PTR128-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
-; PTR128-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
-; PTR128-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
-; PTR128-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
-; PTR128-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
-; PTR128-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
-; PTR128-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
-; PTR128-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
-; PTR128-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
-; PTR128-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
-; PTR128-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
-; PTR128-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
-; PTR128-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16
-; PTR128-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
-; PTR128-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48
-; PTR128-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
-; PTR128-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
-; PTR128-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
-; PTR128-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
-; PTR128-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
-; PTR128-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
-; PTR128-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
-; PTR128-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
-; PTR128-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
-; PTR128-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
-; PTR128-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
-; PTR128-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
-; PTR128-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
-; PTR128-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
-; PTR128-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
-; PTR128-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
-; PTR128-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
-; PTR128-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
-; PTR128-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
-; PTR128-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
-; PTR128-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
-; PTR128-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
-; PTR128-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64
-; PTR128-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
-; PTR128-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96
-; PTR128-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
-; PTR128-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
-; PTR128-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
-; PTR128-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
-; PTR128-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
-; PTR128-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
-; PTR128-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
-; PTR128-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
-; PTR128-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
-; PTR128-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
-; PTR128-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
-; PTR128-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
-; PTR128-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
-; PTR128-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
-; PTR128-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
-; PTR128-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
-; PTR128-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
-; PTR128-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
-; PTR128-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
-; PTR128-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
-; PTR128-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
-; PTR128-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
-; PTR128-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
-; PTR128-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
-; PTR128-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; PTR128-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
-; PTR128-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80
-; PTR128-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
-; PTR128-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112
-; PTR128-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
-; PTR128-NEXT: ret void
-;
; PTR64-LABEL: @multiply(
; PTR64-NEXT: entry:
; PTR64-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
index 87def6b..3d05014 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128
; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64
; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32
@@ -7,128 +6,13 @@
; the need to emit `libc` calls), we perform strided index calculations using
; the same pointer bit-width as the matrix pointers, as determined by the data
; layout. To verify this behaviour, this test runs several strided loads and
-; stores through the lowering pass with (32|64|128)-bit pointers, and verifies
-; the generated code extends / truncates strides accordingly. Similarly,
+; stores through the lowering pass with (32|64)-bit pointers, and verifies the
+; generated code extends / truncates strides accordingly. Similarly,
; `data-layout-multiply-fused.ll` adopts this approach to verify the same
; behaviour for index calculations emitted while lowering fused matrix
; multiplies.
-define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) {
-; PTR128-LABEL: @strided_load_3x3_i128(
-; PTR128-NEXT: entry:
-; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]]
-; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
-; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]]
-; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
-; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]]
-; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
-; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT: ret <9 x double> [[TMP2]]
-;
-; PTR64-LABEL: @strided_load_3x3_i128(
-; PTR64-NEXT: entry:
-; PTR64-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64
-; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
-; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
-; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
-; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
-; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
-; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
-; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR64-NEXT: ret <9 x double> [[TMP2]]
-;
-; PTR32-LABEL: @strided_load_3x3_i128(
-; PTR32-NEXT: entry:
-; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32
-; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
-; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
-; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
-; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
-; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
-; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
-; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR32-NEXT: ret <9 x double> [[TMP2]]
-;
-entry:
- %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3)
- ret <9 x double> %load
-}
-
-define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) {
-; PTR128-LABEL: @strided_load_3x3_const_stride_i128(
-; PTR128-NEXT: entry:
-; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
-; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
-; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT: ret <9 x double> [[TMP2]]
-;
-; PTR64-LABEL: @strided_load_3x3_const_stride_i128(
-; PTR64-NEXT: entry:
-; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
-; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
-; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR64-NEXT: ret <9 x double> [[TMP2]]
-;
-; PTR32-LABEL: @strided_load_3x3_const_stride_i128(
-; PTR32-NEXT: entry:
-; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
-; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
-; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR32-NEXT: ret <9 x double> [[TMP2]]
-;
-entry:
- %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3)
- ret <9 x double> %load
-}
-
define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) {
-; PTR128-LABEL: @strided_load_3x3_i64(
-; PTR128-NEXT: entry:
-; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128
-; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
-; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
-; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
-; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
-; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
-; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
-; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT: ret <9 x double> [[TMP2]]
-;
; PTR64-LABEL: @strided_load_3x3_i64(
; PTR64-NEXT: entry:
; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
@@ -168,18 +52,6 @@ entry:
}
define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) {
-; PTR128-LABEL: @strided_load_3x3_const_stride_i64(
-; PTR128-NEXT: entry:
-; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
-; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
-; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT: ret <9 x double> [[TMP2]]
-;
; PTR64-LABEL: @strided_load_3x3_const_stride_i64(
; PTR64-NEXT: entry:
; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
@@ -210,23 +82,6 @@ entry:
}
define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) {
-; PTR128-LABEL: @strided_load_3x3_i32(
-; PTR128-NEXT: entry:
-; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128
-; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
-; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
-; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
-; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
-; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
-; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
-; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
-; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT: ret <9 x double> [[TMP2]]
-;
; PTR64-LABEL: @strided_load_3x3_i32(
; PTR64-NEXT: entry:
; PTR64-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
@@ -266,18 +121,6 @@ entry:
}
define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) {
-; PTR128-LABEL: @strided_load_3x3_const_stride_i32(
-; PTR128-NEXT: entry:
-; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
-; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
-; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
-; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
-; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
-; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; PTR128-NEXT: ret <9 x double> [[TMP2]]
-;
; PTR64-LABEL: @strided_load_3x3_const_stride_i32(
; PTR64-NEXT: entry:
; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
@@ -307,6 +150,5 @@ entry:
ret <9 x double> %load
}
-declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32)
declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32)
declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
new file mode 100644
index 0000000..4ec5898
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @matmul(ptr %a, ptr %b, ptr %c) {
+; SPLIT_REMAINDER-LABEL: define void @matmul(
+; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; SPLIT_REMAINDER-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[SPLAT_SPLAT13]], [[BLOCK11]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[SPLAT_SPLAT16]], [[BLOCK14]]
+; SPLIT_REMAINDER-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[SPLAT_SPLAT19]], [[BLOCK17]]
+; SPLIT_REMAINDER-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; SPLIT_REMAINDER-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP20]], ptr [[C]], align 4
+; SPLIT_REMAINDER-NEXT: ret void
+;
+; NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <3 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <3 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; NO_SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP10]], ptr [[C]], align 4
+; NO_SPLIT_REMAINDER-NEXT: ret void
+;
+ %a_load = load <3 x float>, ptr %a, align 4
+ %b_load = load <9 x float>, ptr %b, align 4
+ %matmul = tail call <3 x float> @llvm.matrix.multiply.v3f32.v9f32.v3f32(<3 x float> %a_load, <9 x float> %b_load, i32 1, i32 3, i32 3)
+ store <3 x float> %matmul, ptr %c, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
new file mode 100644
index 0000000..fbc2cbc
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @matmul(ptr %a, ptr %b, ptr %c) {
+; SPLIT_REMAINDER-LABEL: define void @matmul(
+; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; SPLIT_REMAINDER-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[BLOCK11]], [[SPLAT_SPLAT13]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[BLOCK14]], [[SPLAT_SPLAT16]]
+; SPLIT_REMAINDER-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK17]], [[SPLAT_SPLAT19]]
+; SPLIT_REMAINDER-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; SPLIT_REMAINDER-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP20]], ptr [[C]], align 4
+; SPLIT_REMAINDER-NEXT: ret void
+;
+; NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <3 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <3 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; NO_SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP10]], ptr [[C]], align 4
+; NO_SPLIT_REMAINDER-NEXT: ret void
+;
+ %a_load = load <9 x float>, ptr %a, align 4
+ %b_load = load <3 x float>, ptr %b, align 4
+ %matmul = tail call <3 x float> @llvm.matrix.multiply.v9f32.v3f32.v3f32(<9 x float> %a_load, <3 x float> %b_load, i32 3, i32 3, i32 1)
+ store <3 x float> %matmul, ptr %c, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/PhaseOrdering/unswitch-cold-func.ll
index 239397b..a6ebdf0 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
+++ b/llvm/test/Transforms/PhaseOrdering/unswitch-cold-func.ll
@@ -1,13 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes='require<profile-summary>,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-force-function-attrs,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -pgo-kind=pgo-instr-use-pipeline -pgo-cold-func-opt=optsize -S | FileCheck %s
+; RUN: opt < %s -passes='pgo-force-function-attrs,function(loop-mssa(simple-loop-unswitch<nontrivial>))' -pgo-kind=pgo-instr-use-pipeline -pgo-cold-func-opt=minsize -S | FileCheck %s
;; Check that non-trivial loop unswitching is not applied to a cold loop in a
;; cold loop nest.
;; IR was generated from the following loop nest, profiled when called
;; with M=0 and N=0.
-;; void hotFunction(bool cond, int M, int N, int * A, int *B, int *C) {
+;; void function(bool cond, int M, int N, int * A, int *B, int *C) {
;; for (unsigned j = 0; j < M; j++)
;; for (unsigned i=0; i < N; i++) {
;; A[i] = B[i] + C[i];
@@ -15,8 +16,8 @@
;; }
;; }
-define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 {
-; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_
+define void @_Z11functionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 {
+; CHECK-LABEL: define void @_Z11functionbiiPiS_S_
; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {{.*}}{
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll b/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll
new file mode 100644
index 0000000..2f97b41
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: br label %[[BB1:.*]]
+; CHECK: [[BB1]]:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP7:%.*]], %[[BB16:.*]] ], [ zeroinitializer, %[[BB1]] ]
+; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB5:.*]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[PHI8:%.*]] = phi double [ 0.000000e+00, %[[BB16]] ], [ 0.000000e+00, %[[BB1]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], %[[BB16]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB1]] ]
+; CHECK-NEXT: switch i32 0, label %[[BB21:.*]] [
+; CHECK-NEXT: i32 4, label %[[BB21]]
+; CHECK-NEXT: i32 1, label %[[BB21]]
+; CHECK-NEXT: i32 0, label %[[BB9:.*]]
+; CHECK-NEXT: ]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[PHI13:%.*]] = phi double [ 0.000000e+00, %[[BB21]] ], [ 0.000000e+00, %[[BB5]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP1]], %[[BB21]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB5]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ [[TMP9:%.*]], %[[BB21]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB5]] ]
+; CHECK-NEXT: switch i32 0, label %[[BB15:.*]] [
+; CHECK-NEXT: i32 1, label %[[BB14:.*]]
+; CHECK-NEXT: i32 0, label %[[BB16]]
+; CHECK-NEXT: ]
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: br label %[[BB16]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
+; CHECK-NEXT: br label %[[BB16]]
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[PHI20:%.*]] = phi double [ 0.000000e+00, %[[BB15]] ], [ 0.000000e+00, %[[BB14]] ], [ 0.000000e+00, %[[BB9]] ]
+; CHECK-NEXT: [[TMP7]] = phi <4 x i32> [ [[TMP5]], %[[BB15]] ], [ [[TMP4]], %[[BB14]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB9]] ]
+; CHECK-NEXT: [[TMP8]] = phi <4 x i32> [ [[TMP6]], %[[BB15]] ], [ [[TMP3]], %[[BB14]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB9]] ]
+; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB1]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: [[TMP9]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: br label %[[BB9]]
+;
+bb:
+ br label %bb1
+
+bb1:
+ %phi = phi i32 [ 0, %bb ], [ 0, %bb1 ], [ %phi17, %bb16 ]
+ %phi2 = phi i32 [ 0, %bb ], [ 0, %bb1 ], [ %phi18, %bb16 ]
+ %phi3 = phi i32 [ 0, %bb ], [ poison, %bb16 ], [ 0, %bb1 ]
+ %phi4 = phi i32 [ 0, %bb ], [ poison, %bb16 ], [ 0, %bb1 ]
+ br i1 false, label %bb1, label %bb5
+
+bb5:
+ %phi6 = phi i32 [ %phi17, %bb16 ], [ 0, %bb1 ]
+ %phi7 = phi i32 [ %phi19, %bb16 ], [ 0, %bb1 ]
+ %phi8 = phi double [ 0.000000e+00, %bb16 ], [ 0.000000e+00, %bb1 ]
+ switch i32 0, label %bb21 [
+ i32 4, label %bb21
+ i32 1, label %bb21
+ i32 0, label %bb9
+ ]
+
+bb9:
+ %phi10 = phi i32 [ %phi6, %bb21 ], [ 0, %bb5 ]
+ %phi11 = phi i32 [ %phi7, %bb21 ], [ 0, %bb5 ]
+ %phi12 = phi i32 [ 0, %bb21 ], [ 0, %bb5 ]
+ %phi13 = phi double [ 0.000000e+00, %bb21 ], [ 0.000000e+00, %bb5 ]
+ switch i32 0, label %bb15 [
+ i32 1, label %bb14
+ i32 0, label %bb16
+ ]
+
+bb14:
+ br label %bb16
+
+bb15:
+ %add = add i32 0, %phi10
+ br label %bb16
+
+bb16:
+ %phi17 = phi i32 [ %add, %bb15 ], [ %phi10, %bb14 ], [ 0, %bb9 ]
+ %phi18 = phi i32 [ %phi11, %bb15 ], [ 0, %bb14 ], [ 0, %bb9 ]
+ %phi19 = phi i32 [ %phi12, %bb15 ], [ %phi12, %bb14 ], [ 0, %bb9 ]
+ %phi20 = phi double [ 0.000000e+00, %bb15 ], [ 0.000000e+00, %bb14 ], [ 0.000000e+00, %bb9 ]
+ br i1 false, label %bb5, label %bb1
+
+bb21:
+ br label %bb9
+}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll
index 9ab713c..383407b 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll
@@ -18,7 +18,6 @@
; the analysis caches.
;
; CHECK: Running pass: SimpleLoopUnswitchPass on loop %loop_begin in function test6
-; CHECK-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-NEXT: Clearing all analysis results for: loop_a_inner
diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll
index b6d5ad9..43d1a79 100644
--- a/llvm/test/Verifier/matrix-intrinsics.ll
+++ b/llvm/test/Verifier/matrix-intrinsics.ll
@@ -1,8 +1,7 @@
-; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not opt -S %s 2>&1 | FileCheck %s
define <4 x float> @transpose(<4 x float> %m, i32 %arg) {
-; CHECK: assembly parsed, but does not verify as correct!
-; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
+; CHECK: Result of a matrix operation does not fit in the returned vector!
; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
; CHECK-NEXT: immarg operand has non-immediate parameter
@@ -118,16 +117,34 @@ define void @column.major_store_stride_too_small(ptr %m, i64 %arg) {
ret void
}
+define <4 x float> @column.major_load_stride_i128(ptr %m, i32 %arg) {
+; CHECK-NEXT: Stride bitwidth cannot exceed 64!
+; CHECK-NEXT: ptr @llvm.matrix.column.major.load.v4f32.i128
+ %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i128(ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2)
+ ret <4 x float> %result.1
+}
+
+define void @column.major_store_stride_i128(ptr %m, i64 %arg) {
+; CHECK-NEXT: Stride bitwidth cannot exceed 64!
+; CHECK-NEXT: ptr @llvm.matrix.column.major.store.v4f32.i128
+ call void @llvm.matrix.column.major.store.v4f32.i128(<4 x float> zeroinitializer, ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2)
+ ret void
+}
+
declare <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(ptr, i64, i1, i32, i32)
declare <4 x float> @llvm.matrix.column.major.load.v4f32.p0(ptr, i64, i1, i32, i32)
declare <4 x float> @llvm.matrix.column.major.load.v4f32.i64(ptr, i64, i1, i32, i32)
declare <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr, i64, i1, i32, i32)
+declare <6 x float> @llvm.matrix.column.major.load.v6f32.i8(ptr, i8, i1, i32, i32)
+declare <6 x float> @llvm.matrix.column.major.load.v6f32.i128(ptr, i128, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4f32.i64(<4 x float>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v6f32.i64(<6 x float>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4i32.vi32(<4 x i32>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4f32.p0(<4 x float>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4p0.i64(<4 x ptr>, ptr, i64, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v4p0.i8(<4 x ptr>, ptr, i8, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v4p0.i128(<4 x ptr>, ptr, i128, i1, i32, i32)
declare <4 x i32> @llvm.matrix.transpose.v4i32.v4f32(<4 x float>, i32, i32)
declare <4 x float> @llvm.matrix.transpose.v4f32(<4 x float>, i32, i32)
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s
index 4ec1683..4ec1683 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s
index 5ebed10..5ebed10 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s
diff --git a/llvm/test/tools/llvm-reduce/reduce-instructions-alloca.ll b/llvm/test/tools/llvm-reduce/reduce-instructions-alloca.ll
new file mode 100644
index 0000000..94b45d2
--- /dev/null
+++ b/llvm/test/tools/llvm-reduce/reduce-instructions-alloca.ll
@@ -0,0 +1,16 @@
+; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=instructions --test FileCheck --test-arg --check-prefixes=CHECK,INTERESTING --test-arg %s --test-arg --input-file %s -o %t
+; RUN: FileCheck -check-prefixes=CHECK,RESULT %s < %t
+
+; CHECK-LABEL: define void @alloca(
+; INTERESTING: call void @llvm.lifetime.start.p0(
+; INTERESTING: call void @llvm.lifetime.end.p0(
+
+; RESULT: call void @llvm.lifetime.start.p0(ptr poison)
+; RESULT-NEXT: call void @llvm.lifetime.end.p0(ptr poison)
+; RESULT-NEXT: ret void
+define void @alloca(ptr %ptr) {
+ %alloca = alloca i32, align 4
+ call void @llvm.lifetime.start.p0(ptr %alloca)
+ call void @llvm.lifetime.end.p0(ptr %alloca)
+ ret void
+}
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index a2f4b3e..9db7aa0 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -43,6 +43,9 @@ int llvm_test_dibuilder(void) {
LLVMMetadataRef File = LLVMDIBuilderCreateFile(DIB, Filename,
strlen(Filename), ".", 1);
+ LLVMMetadataRef FileCS = LLVMDIBuilderCreateFileWithChecksum(
+ DIB, Filename, strlen(Filename), ".", 1, CSK_MD5, "1234", 4, "source", 6);
+
LLVMMetadataRef CompileUnit = LLVMDIBuilderCreateCompileUnit(
DIB, LLVMDWARFSourceLanguageC, File, "llvm-c-test", 11, 0, NULL, 0, 0,
NULL, 0, LLVMDWARFEmissionFull, 0, 0, 0, "/", 1, "", 0);
@@ -61,7 +64,7 @@ int llvm_test_dibuilder(void) {
"/test/include/llvm-c-test-import.h", 34,
"", 0);
LLVMMetadataRef ImportedModule = LLVMDIBuilderCreateImportedModuleFromModule(
- DIB, Module, OtherModule, File, 42, NULL, 0);
+ DIB, Module, OtherModule, FileCS, 42, NULL, 0);
LLVMDIBuilderCreateImportedModuleFromAlias(DIB, Module, ImportedModule, File,
42, NULL, 0);
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp
index f1f5d6b..19b69e8 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp
@@ -13,6 +13,8 @@
#include "ReduceInstructions.h"
#include "Utils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
using namespace llvm;
@@ -37,7 +39,9 @@ void llvm::reduceInstructionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) {
for (auto &Inst :
make_early_inc_range(make_range(BB.begin(), std::prev(BB.end())))) {
if (!shouldAlwaysKeep(Inst) && !O.shouldKeep()) {
- Inst.replaceAllUsesWith(getDefaultValue(Inst.getType()));
+ Inst.replaceAllUsesWith(isa<AllocaInst>(Inst)
+ ? PoisonValue::get(Inst.getType())
+ : getDefaultValue(Inst.getType()));
Inst.eraseFromParent();
}
}
diff --git a/llvm/unittests/ADT/BitTest.cpp b/llvm/unittests/ADT/BitTest.cpp
index eaed4e1..5b3df91 100644
--- a/llvm/unittests/ADT/BitTest.cpp
+++ b/llvm/unittests/ADT/BitTest.cpp
@@ -270,6 +270,22 @@ TEST(BitTest, BitWidthConstexpr) {
llvm::bit_width_constexpr(std::numeric_limits<uint64_t>::max()) == 64);
}
+TEST(BitTest, BitCeilConstexpr) {
+ static_assert(llvm::bit_ceil_constexpr(0u) == 1);
+ static_assert(llvm::bit_ceil_constexpr(1u) == 1);
+ static_assert(llvm::bit_ceil_constexpr(2u) == 2);
+ static_assert(llvm::bit_ceil_constexpr(3u) == 4);
+ static_assert(llvm::bit_ceil_constexpr(4u) == 4);
+ static_assert(llvm::bit_ceil_constexpr(5u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(6u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(7u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(8u) == 8);
+
+ static_assert(llvm::bit_ceil_constexpr(255u) == 256);
+ static_assert(llvm::bit_ceil_constexpr(256u) == 256);
+ static_assert(llvm::bit_ceil_constexpr(257u) == 512);
+}
+
TEST(BitTest, CountlZero) {
uint8_t Z8 = 0;
uint16_t Z16 = 0;
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index 3ab2caf..57e15a4 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -39,20 +39,21 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
"64-i128:128-n32:64-S128-Fn32");
// Check that AMDGPU targets add -G1 if it's not present.
- EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "e-p:32:32-G1");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "m:e-e-p:32:32-G1");
// and that ANDGCN adds p7 and p8 as well.
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64", "amdgcn"),
- "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G1", "amdgcn"),
- "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
// Check that the old AMDGCN p8:128:128 definition is upgraded
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p8:128:128-G1", "amdgcn"),
- "e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-"
- "p9:192:256:256:32");
+ "m:e-e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-p9:"
+ "192:256:256:32");
// but that r600 does not.
- EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"), "e-p:32:32-G1");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"),
+ "m:e-e-p:32:32-G1");
// Ensure that the non-integral direction for address space 8 doesn't get
// added in to pointer declarations.
@@ -62,11 +63,10 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
"64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7",
"amdgcn"),
- "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-"
- "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
+ "m:e-e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:"
+ "64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
"1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:"
- "128:48-"
- "p9:192:256:256:32");
+ "128:48-p9:192:256:256:32");
// Check that RISCV64 upgrades -n64 to -n32:64.
EXPECT_EQ(UpgradeDataLayoutString("e-m:e-p:64:64-i64:64-i128:128-n64-S128",
@@ -147,28 +147,29 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
"64-S128-Fn32");
// Check that AMDGPU targets don't add -G1 if there is already a -G flag.
- EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"), "e-p:32:32-G2");
- EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "G2");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"),
+ "m:e-e-p:32:32-G2");
+ EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "m:e-G2");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G2", "amdgcn"),
- "e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
EXPECT_EQ(UpgradeDataLayoutString("G2-e-p:64:64", "amdgcn"),
- "G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G0", "amdgcn"),
- "e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
// Check that AMDGCN targets don't add already declared address space 7.
EXPECT_EQ(
UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
- "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
EXPECT_EQ(
UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
- "p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
EXPECT_EQ(
UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
- "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
// Check that SPIR & SPIRV targets don't add -G1 if there is already a -G
// flag.
@@ -198,10 +199,10 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) {
EXPECT_EQ(DL2, "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128");
// Check that AMDGPU targets add G1 if it's not present.
- EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "G1");
+ EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "m:e-G1");
EXPECT_EQ(
UpgradeDataLayoutString("", "amdgcn"),
- "G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
// Check that SPIR & SPIRV targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1");
diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
index f35a378..686d85d 100644
--- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
@@ -43,7 +43,7 @@ protected:
// COFF-ARM64 is not supported yet
auto Triple = JTMB->getTargetTriple();
- if (Triple.isOSBinFormatCOFF() && Triple.isAArch64())
+ if (Triple.isOSBinFormatCOFF())
GTEST_SKIP();
// SystemZ is not supported yet.
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index ed7a4fe..3414190 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -99,6 +99,7 @@ static inline bool inheritsFrom(InstructionContext child,
(noPrefix && inheritsFrom(child, IC_XS, noPrefix)));
case IC_64BIT:
return (inheritsFrom(child, IC_64BIT_REXW) ||
+ inheritsFrom(child, IC_64BIT_REX2) ||
(noPrefix && inheritsFrom(child, IC_64BIT_OPSIZE, noPrefix)) ||
(!AdSize64 && inheritsFrom(child, IC_64BIT_ADSIZE)) ||
(noPrefix && inheritsFrom(child, IC_64BIT_XD, noPrefix)) ||
@@ -151,8 +152,10 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_64BIT_REXW_XS:
case IC_64BIT_REXW_OPSIZE:
case IC_64BIT_REXW_ADSIZE:
- case IC_64BIT_REX2:
+ case IC_64BIT_REX2_REXW:
return false;
+ case IC_64BIT_REX2:
+ return inheritsFrom(child, IC_64BIT_REX2_REXW);
case IC_VEX:
return (VEX_LIG && WIG && inheritsFrom(child, IC_VEX_L_W)) ||
(WIG && inheritsFrom(child, IC_VEX_W)) ||
@@ -980,9 +983,11 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
if ((index & ATTR_EVEXB) && (index & ATTR_EVEXU))
o << "_U";
}
- } else if ((index & ATTR_64BIT) && (index & ATTR_REX2))
+ } else if ((index & ATTR_64BIT) && (index & ATTR_REX2)) {
o << "IC_64BIT_REX2";
- else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
+ if (index & ATTR_REXW)
+ o << "_REXW";
+ } else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
o << "IC_64BIT_REXW_XS";
else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XD))
o << "IC_64BIT_REXW_XD";
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index e87a1c9..a006888 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -365,6 +365,8 @@ InstructionContext RecognizableInstr::insnContext() const {
insnContext = IC_64BIT_XD;
else if (OpPrefix == X86Local::XS)
insnContext = IC_64BIT_XS;
+ else if (HasREX_W && ExplicitREX2Prefix)
+ insnContext = IC_64BIT_REX2_REXW;
else if (ExplicitREX2Prefix)
insnContext = IC_64BIT_REX2;
else if (HasREX_W)
diff --git a/llvm/utils/gn/secondary/lld/test/BUILD.gn b/llvm/utils/gn/secondary/lld/test/BUILD.gn
index dabc578..585e0a4 100644
--- a/llvm/utils/gn/secondary/lld/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lld/test/BUILD.gn
@@ -1,5 +1,6 @@
import("//llvm/lib/DebugInfo/PDB/enable_dia.gni")
import("//llvm/triples.gni")
+import("//llvm/utils/gn/build/libs/pthread/enable.gni")
import("//llvm/utils/gn/build/libs/xml/enable.gni")
import("//llvm/utils/gn/build/libs/zlib/enable.gni")
import("//llvm/utils/gn/build/libs/zstd/enable.gni")
@@ -88,6 +89,12 @@ write_lit_cfg("lit_site_cfg") {
extra_values += [ "LLVM_ENABLE_LIBXML2=0" ] # Must be 0.
}
+ if (llvm_enable_threads) {
+ extra_values += [ "LLVM_ENABLE_THREADS=1" ]
+ } else {
+ extra_values += [ "LLVM_ENABLE_THREADS=0" ] # Must be 0.
+ }
+
if (llvm_enable_zlib) {
extra_values += [ "LLVM_ENABLE_ZLIB=1" ]
} else {
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 3f8be5e..b570f8d 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -107,6 +107,7 @@ Instrumentation/AddressSanitizer/asan-stack-safety.ll
Instrumentation/AddressSanitizer/asan-struct-scalable.ll
Instrumentation/AddressSanitizer/asan-vp-load-store.ll
Instrumentation/AddressSanitizer/asan-vs-gvn.ll
+Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll
Instrumentation/AddressSanitizer/basic.ll
Instrumentation/AddressSanitizer/basic-msvc64.ll
Instrumentation/AddressSanitizer/byref-args.ll
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index 54645d0..001339f 100755..100644
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -156,16 +156,14 @@ set common_cmake_flags=^
-DLLVM_BUILD_LLVM_C_DYLIB=ON ^
-DPython3_FIND_REGISTRY=NEVER ^
-DPACKAGE_VERSION=%package_version% ^
- -DLLDB_RELOCATABLE_PYTHON=1 ^
- -DLLDB_EMBED_PYTHON_HOME=OFF ^
-DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^
-DLLVM_ENABLE_LIBXML2=FORCE_ON ^
- -DLLDB_ENABLE_LIBXML2=OFF ^
-DCLANG_ENABLE_LIBXML2=OFF ^
-DCMAKE_C_FLAGS="%common_compiler_flags%" ^
-DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^
-DLLVM_ENABLE_RPMALLOC=ON ^
- -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp"
+ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld" ^
+ -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp"
if "%force-msvc%" == "" (
where /q clang-cl
@@ -185,6 +183,11 @@ if "%force-msvc%" == "" (
)
)
+set common_lldb_flags=^
+ -DLLDB_RELOCATABLE_PYTHON=1 ^
+ -DLLDB_EMBED_PYTHON_HOME=OFF ^
+ -DLLDB_ENABLE_LIBXML2=OFF
+
set cmake_profile_flags=""
REM Preserve original path
@@ -192,8 +195,8 @@ set OLDPATH=%PATH%
REM Build the 32-bits and/or 64-bits binaries.
if "%x86%" == "true" call :do_build_32 || exit /b 1
-if "%x64%" == "true" call :do_build_64 || exit /b 1
-if "%arm64%" == "true" call :do_build_arm64 || exit /b 1
+if "%x64%" == "true" call :do_build_64_common amd64 %python64_dir% || exit /b 1
+if "%arm64%" == "true" call :do_build_64_common arm64 %pythonarm64_dir% || exit /b 1
exit /b 0
::==============================================================================
@@ -212,8 +215,6 @@ set "stage0_bin_dir=%build_dir%/build32_stage0/bin"
set cmake_flags=^
%common_cmake_flags% ^
-DLLVM_ENABLE_RPMALLOC=OFF ^
- -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
- -DPYTHON_HOME=%PYTHONHOME% ^
-DPython3_ROOT_DIR=%PYTHONHOME% ^
-DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
-DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
@@ -231,6 +232,9 @@ REM CMake expects the paths that specifies the compiler and linker to be
REM with forward slash.
set all_cmake_flags=^
%cmake_flags% ^
+ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;" ^
+ %common_lldb_flags% ^
+ -DPYTHON_HOME=%PYTHONHOME% ^
-DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
@@ -254,32 +258,42 @@ exit /b 0
::==============================================================================
::==============================================================================
-:: Build 64-bits binaries.
+:: Build 64-bits binaries (common function for both x64 and arm64)
::==============================================================================
-:do_build_64
-call :set_environment %python64_dir% || exit /b 1
-call "%vsdevcmd%" -arch=amd64 || exit /b 1
+:do_build_64_common
+set arch=%1
+set python_dir=%2
+
+call :set_environment %python_dir% || exit /b 1
+call "%vsdevcmd%" -arch=%arch% || exit /b 1
@echo on
-mkdir build64_stage0
-cd build64_stage0
+mkdir build_%arch%_stage0
+cd build_%arch%_stage0
call :do_build_libxml || exit /b 1
REM Stage0 binaries directory; used in stage1.
-set "stage0_bin_dir=%build_dir%/build64_stage0/bin"
+set "stage0_bin_dir=%build_dir%/build_%arch%_stage0/bin"
set cmake_flags=^
%common_cmake_flags% ^
- -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
- -DPYTHON_HOME=%PYTHONHOME% ^
-DPython3_ROOT_DIR=%PYTHONHOME% ^
-DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
- -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
+ -DCLANG_DEFAULT_LINKER=lld
+if "%arch%"=="arm64" (
+ set cmake_flags=%cmake_flags% ^
+ -DCOMPILER_RT_BUILD_SANITIZERS=OFF
+)
-cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+cmake -GNinja %cmake_flags% ^
+ -DLLVM_TARGETS_TO_BUILD=Native ^
+ %llvm_src%\llvm || exit /b 1
ninja || ninja || ninja || exit /b 1
ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
-ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+if "%arch%"=="amd64" (
+ ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1
+)
ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
cd..
@@ -293,24 +307,40 @@ set all_cmake_flags=^
-DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
-DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
-DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+if "%arch%"=="arm64" (
+ set all_cmake_flags=%all_cmake_flags% ^
+ -DCPACK_SYSTEM_NAME=woa64
+)
set cmake_flags=%all_cmake_flags:\=/%
-
-mkdir build64
-cd build64
+mkdir build_%arch%
+cd build_%arch%
call :do_generate_profile || exit /b 1
-cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
+cmake -GNinja %cmake_flags% ^
+ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;flang;mlir" ^
+ %common_lldb_flags% ^
+ -DPYTHON_HOME=%PYTHONHOME% ^
+ %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
ninja || ninja || ninja || exit /b 1
ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
-ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+if "%arch%"=="amd64" (
+ ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1
+)
ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+REM ninja check-flang || ninja check-flang || ninja check-flang || exit /b 1
+REM ninja check-mlir || ninja check-mlir || ninja check-mlir || exit /b 1
+REM ninja check-lldb || ninja check-lldb || ninja check-lldb || exit /b 1
ninja package || exit /b 1
:: generate tarball with install toolchain only off
-set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+if "%arch%"=="amd64" (
+ set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+) else (
+ set filename=clang+llvm-%version%-aarch64-pc-windows-msvc
+)
cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^
-DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1
ninja install || exit /b 1
@@ -320,75 +350,7 @@ cd ..
7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz
exit /b 0
-::==============================================================================
-
-::==============================================================================
-:: Build arm64 binaries.
-::==============================================================================
-:do_build_arm64
-call :set_environment %pythonarm64_dir% || exit /b 1
-call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1
-@echo on
-mkdir build_arm64_stage0
-cd build_arm64_stage0
-call :do_build_libxml || exit /b 1
-
-REM Stage0 binaries directory; used in stage1.
-set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin"
-set cmake_flags=^
- %common_cmake_flags% ^
- -DCLANG_DEFAULT_LINKER=lld ^
- -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
- -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
- -DPython3_ROOT_DIR=%PYTHONHOME% ^
- -DCOMPILER_RT_BUILD_PROFILE=OFF ^
- -DCOMPILER_RT_BUILD_SANITIZERS=OFF
-
-REM We need to build stage0 compiler-rt with clang-cl (msvc lacks some builtins).
-cmake -GNinja %cmake_flags% ^
- -DCMAKE_C_COMPILER=clang-cl.exe ^
- -DCMAKE_CXX_COMPILER=clang-cl.exe ^
- %llvm_src%\llvm || exit /b 1
-ninja || exit /b 1
-::ninja check-llvm || exit /b 1
-::ninja check-clang || exit /b 1
-::ninja check-lld || exit /b 1
-::ninja check-sanitizer || exit /b 1
-::ninja check-clang-tools || exit /b 1
-::ninja check-clangd || exit /b 1
-cd..
-
-REM CMake expects the paths that specifies the compiler and linker to be
-REM with forward slash.
-REM CPACK_SYSTEM_NAME is set to have a correct name for installer generated.
-set all_cmake_flags=^
- %cmake_flags% ^
- -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
- -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
- -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
- -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
- -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^
- -DCPACK_SYSTEM_NAME=woa64
-set cmake_flags=%all_cmake_flags:\=/%
-mkdir build_arm64
-cd build_arm64
-cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
-ninja || exit /b 1
-REM Check but do not fail on errors.
-ninja check-lldb
-::ninja check-llvm || exit /b 1
-::ninja check-clang || exit /b 1
-::ninja check-lld || exit /b 1
-::ninja check-sanitizer || exit /b 1
-::ninja check-clang-tools || exit /b 1
-::ninja check-clangd || exit /b 1
-ninja package || exit /b 1
-cd ..
-
-exit /b 0
-::==============================================================================
-::
::==============================================================================
:: Set PATH and some environment variables.
::==============================================================================